1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "tree-pass.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
51 #include "alloc-pool.h"
53 static bool vectorizable_slp_permutation (vec_info
*, gimple_stmt_iterator
*,
54 slp_tree
, stmt_vector_for_cost
*);
56 static object_allocator
<_slp_tree
> *slp_tree_pool
;
57 static slp_tree slp_first_node
;
62 slp_tree_pool
= new object_allocator
<_slp_tree
> ("SLP nodes");
68 while (slp_first_node
)
69 delete slp_first_node
;
/* Pool allocation for SLP nodes: every _slp_tree comes from the global
   slp_tree_pool object allocator rather than the heap.
   NOTE(review): the embedded original line numbers jump 75 -> 77 -> 78,
   so the surrounding brace lines of this definition are missing from
   this extract.  */
75 _slp_tree::operator new (size_t n
)
/* Only whole-node allocations are legal through this operator.  */
77 gcc_assert (n
== sizeof (_slp_tree
));
78 return slp_tree_pool
->allocate_raw ();
/* Return an SLP node's storage to the global slp_tree_pool.
   Mirrors operator new above.
   NOTE(review): brace lines are missing from this extract (original
   numbering jumps 82 -> 84 -> 85).  */
82 _slp_tree::operator delete (void *node
, size_t n
)
/* Guard against deleting anything but a whole node.  */
84 gcc_assert (n
== sizeof (_slp_tree
));
85 slp_tree_pool
->remove_raw (node
);
89 /* Initialize a SLP node. */
/* Initialize a fresh SLP node: link it at the head of the global
   doubly-linked node list (slp_first_node, via prev_node/next_node)
   and reset every field to its empty/uninitialized state with a
   reference count of 1.
   NOTE(review): original lines 92 and 94 are missing from this
   extract -- line 94 presumably guarded the slp_first_node->prev_node
   update for an empty list; confirm against the upstream source.  */
91 _slp_tree::_slp_tree ()
/* New head of the list has no predecessor.  */
93 this->prev_node
= NULL
;
95 slp_first_node
->prev_node
= this;
96 this->next_node
= slp_first_node
;
97 slp_first_node
= this;
/* All vectors start out empty (vNULL); released in the destructor.  */
98 SLP_TREE_SCALAR_STMTS (this) = vNULL
;
99 SLP_TREE_SCALAR_OPS (this) = vNULL
;
100 SLP_TREE_VEC_STMTS (this) = vNULL
;
101 SLP_TREE_VEC_DEFS (this) = vNULL
;
102 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
103 SLP_TREE_CHILDREN (this) = vNULL
;
104 SLP_TREE_LOAD_PERMUTATION (this) = vNULL
;
105 SLP_TREE_LANE_PERMUTATION (this) = vNULL
;
106 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def
;
107 SLP_TREE_CODE (this) = ERROR_MARK
;
108 SLP_TREE_VECTYPE (this) = NULL_TREE
;
109 SLP_TREE_REPRESENTATIVE (this) = NULL
;
/* A new node starts with a single owner.  */
110 SLP_TREE_REF_COUNT (this) = 1;
111 this->max_nunits
= 1;
115 /* Tear down a SLP node. */
/* Tear down an SLP node: unlink it from the global node list and
   release all owned vectors.
   NOTE(review): original lines 118-119, 121 and 123 are missing here --
   they presumably held the NULL checks choosing between updating
   prev_node->next_node and updating slp_first_node; confirm against the
   upstream source.  */
117 _slp_tree::~_slp_tree ()
120 this->prev_node
->next_node
= this->next_node
;
122 slp_first_node
= this->next_node
;
124 this->next_node
->prev_node
= this->prev_node
;
/* Release every vec<> member set up by the constructor.  */
125 SLP_TREE_CHILDREN (this).release ();
126 SLP_TREE_SCALAR_STMTS (this).release ();
127 SLP_TREE_SCALAR_OPS (this).release ();
128 SLP_TREE_VEC_STMTS (this).release ();
129 SLP_TREE_VEC_DEFS (this).release ();
130 SLP_TREE_LOAD_PERMUTATION (this).release ();
131 SLP_TREE_LANE_PERMUTATION (this).release ();
134 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
/* Decrement NODE's reference count; once it drops to zero, recursively
   release the children and free the node itself.
   NOTE(review): original lines 138-141, 143-144, 146 and 148-149 are
   missing from this extract (declarations of i/child, the early return,
   and the final delete); confirm against the upstream source.  */
137 vect_free_slp_tree (slp_tree node
)
/* Other owners remain -- nothing to free yet.  */
142 if (--SLP_TREE_REF_COUNT (node
) != 0)
145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
147 vect_free_slp_tree (child
);
152 /* Return a location suitable for dumps related to the SLP instance. */
/* Return a statement suitable for locating diagnostics about this SLP
   instance: the root statement when one exists, otherwise the first
   scalar statement of the instance tree.
   NOTE(review): the condition selecting between the two returns
   (original lines 156-159) is missing from this extract -- presumably
   "if (root_stmt)"; confirm against the upstream source.  */
155 _slp_instance::location () const
158 return root_stmt
->stmt
;
160 return SLP_TREE_SCALAR_STMTS (root
)[0]->stmt
;
164 /* Free the memory allocated for the SLP instance. */
/* Free the memory owned by an SLP instance: drop the reference on the
   instance tree and release the loads, subgraph-entry and cost vectors.
   NOTE(review): the line freeing the instance object itself (original
   line 173) is not visible in this extract.  */
167 vect_free_slp_instance (slp_instance instance
)
169 vect_free_slp_tree (SLP_INSTANCE_TREE (instance
));
170 SLP_INSTANCE_LOADS (instance
).release ();
171 instance
->subgraph_entries
.release ();
172 instance
->cost_vec
.release ();
177 /* Create an SLP node for SCALAR_STMTS. */
/* Create an internal-def SLP node with room for NOPS children and the
   given tree CODE; no scalar stmts are recorded.
   NOTE(review): the return of NODE (original lines 187-188) is not
   visible in this extract.  */
180 vect_create_new_slp_node (unsigned nops
, tree_code code
)
182 slp_tree node
= new _slp_tree
;
183 SLP_TREE_SCALAR_STMTS (node
) = vNULL
;
/* Reserve space so children can be quick_push'd.  */
184 SLP_TREE_CHILDREN (node
).create (nops
);
185 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
186 SLP_TREE_CODE (node
) = code
;
189 /* Create an SLP node for SCALAR_STMTS. */
/* Initialize NODE as an internal-def SLP node over SCALAR_STMTS with
   room for NOPS children; the first scalar stmt becomes the
   representative and the lane count is the number of stmts.
   NOTE(review): the return of NODE (original lines 200-201) is not
   visible in this extract.  */
192 vect_create_new_slp_node (slp_tree node
,
193 vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
/* NODE takes ownership of the SCALAR_STMTS vector.  */
195 SLP_TREE_SCALAR_STMTS (node
) = scalar_stmts
;
196 SLP_TREE_CHILDREN (node
).create (nops
);
197 SLP_TREE_DEF_TYPE (node
) = vect_internal_def
;
198 SLP_TREE_REPRESENTATIVE (node
) = scalar_stmts
[0];
199 SLP_TREE_LANES (node
) = scalar_stmts
.length ();
203 /* Create an SLP node for SCALAR_STMTS. */
/* Convenience overload: allocate a fresh node and initialize it over
   SCALAR_STMTS with room for NOPS children.  */
206 vect_create_new_slp_node (vec
<stmt_vec_info
> scalar_stmts
, unsigned nops
)
208 return vect_create_new_slp_node (new _slp_tree
, scalar_stmts
, nops
);
211 /* Create an SLP node for OPS. */
/* Initialize NODE as an external-def SLP node over the scalar OPS;
   the lane count is the number of ops.
   NOTE(review): the return of NODE (original lines 219-220) is not
   visible in this extract.  */
214 vect_create_new_slp_node (slp_tree node
, vec
<tree
> ops
)
/* NODE takes ownership of the OPS vector.  */
216 SLP_TREE_SCALAR_OPS (node
) = ops
;
217 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
218 SLP_TREE_LANES (node
) = ops
.length ();
222 /* Create an SLP node for OPS. */
/* Convenience overload: allocate a fresh node and initialize it as an
   external-def node over OPS.  */
225 vect_create_new_slp_node (vec
<tree
> ops
)
227 return vect_create_new_slp_node (new _slp_tree
, ops
);
231 /* This structure is used in creation of an SLP tree. Each instance
232 corresponds to the same operand in a group of scalar stmts in an SLP
234 typedef struct _slp_oprnd_info
/* NOTE(review): members referenced elsewhere in this file (ops,
   first_op_type, any_pattern) and the struct's closing lines are
   missing from this extract (original lines 235, 238-239, 242-248
   absent).  */
236 /* Def-stmts for the operands. */
237 vec
<stmt_vec_info
> def_stmts
;
240 /* Information about the first statement, its vector def-type, type, the
241 operand itself in case it's constant, and an indication if it's a pattern
   stmt.  (Comment closing reconstructed -- original line 242 is missing
   from this extract.)  */
244 enum vect_def_type first_dt
;
249 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
/* Allocate a vector of NOPS operand-info records, each with room for
   GROUP_SIZE def-stmts and ops, and all fields reset to their
   uninitialized state.
   NOTE(review): the declaration of loop counter i, the loop braces and
   the final return of oprnds_info (original lines 253-254, 257, 260,
   268-271) are missing from this extract.  */
251 static vec
<slp_oprnd_info
>
252 vect_create_oprnd_info (int nops
, int group_size
)
255 slp_oprnd_info oprnd_info
;
256 vec
<slp_oprnd_info
> oprnds_info
;
258 oprnds_info
.create (nops
);
/* One record per operand position.  */
259 for (i
= 0; i
< nops
; i
++)
261 oprnd_info
= XNEW (struct _slp_oprnd_info
);
262 oprnd_info
->def_stmts
.create (group_size
);
263 oprnd_info
->ops
.create (group_size
);
264 oprnd_info
->first_dt
= vect_uninitialized_def
;
265 oprnd_info
->first_op_type
= NULL_TREE
;
266 oprnd_info
->any_pattern
= false;
267 oprnds_info
.quick_push (oprnd_info
);
274 /* Free operands info. */
/* Release every operand-info record in OPRNDS_INFO (both its def_stmts
   and ops vectors plus the record itself), then the outer vector.
   NOTE(review): the declaration of loop counter i and the loop braces
   (original lines 278-279, 281, 283, 287-288) are missing from this
   extract.  */
277 vect_free_oprnd_info (vec
<slp_oprnd_info
> &oprnds_info
)
280 slp_oprnd_info oprnd_info
;
282 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
284 oprnd_info
->def_stmts
.release ();
285 oprnd_info
->ops
.release ();
286 XDELETE (oprnd_info
);
289 oprnds_info
.release ();
293 /* Return true if STMTS contains a pattern statement. */
/* Return true if STMTS contains at least one pattern statement.
   NOTE(review): the declaration of i and the return statements
   (original lines 297, 299, 302-304) are missing from this extract.  */
296 vect_contains_pattern_stmt_p (vec
<stmt_vec_info
> stmts
)
298 stmt_vec_info stmt_info
;
300 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
301 if (is_pattern_stmt_p (stmt_info
))
306 /* Return true when all lanes in the external or constant NODE have
   the same scalar value.  (Sentence completion reconstructed --
   original line 307 is missing from this extract.) */
/* Return whether all lanes of the external/constant NODE are uniform,
   i.e. every scalar op equals the first one (compared with
   operand_equal_p).  Only valid on constant/external nodes, as the
   assert enforces.
   NOTE(review): the return statements and loop body braces (original
   lines 314, 317-319, 322-323, 325-328) are missing from this
   extract.  */
310 vect_slp_tree_uniform_p (slp_tree node
)
312 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
313 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
);
315 /* Pre-existing vectors. */
316 if (SLP_TREE_SCALAR_OPS (node
).is_empty ())
320 tree op
, first
= NULL_TREE
;
321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
/* Any lane differing from the first makes the node non-uniform.  */
324 else if (!operand_equal_p (first
, op
, 0))
330 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
331 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
/* Walk the interleaving chain starting at FIRST_STMT_INFO to find the
   index of STMT_INFO's data-ref within it, accumulating group gaps;
   returns -1 when STMT_INFO is not part of that chain.
   NOTE(review): the result variable declaration/initialization, the
   early -1 returns and the loop braces (original lines 337, 339-340,
   342-345, 347, 349, 351, 353-354) are missing from this extract.  */
335 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info
,
336 stmt_vec_info first_stmt_info
)
338 stmt_vec_info next_stmt_info
= first_stmt_info
;
/* STMT_INFO must belong to the chain headed by FIRST_STMT_INFO.  */
341 if (first_stmt_info
!= DR_GROUP_FIRST_ELEMENT (stmt_info
))
346 if (next_stmt_info
== stmt_info
)
348 next_stmt_info
= DR_GROUP_NEXT_ELEMENT (next_stmt_info
);
/* Gaps in the group count toward the position.  */
350 result
+= DR_GROUP_GAP (next_stmt_info
);
352 while (next_stmt_info
);
357 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
358 using the method implemented by duplicate_and_interleave. Return true
359 if so, returning the number of intermediate vectors in *NVECTORS_OUT
360 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
/* Check whether COUNT elements of scalar type ELT_TYPE can be loaded
   via duplicate_and_interleave: find a vector type matching the size
   of the natural vector type for the group, then verify the two
   required constant permutations are supported by the target.  On
   success *NVECTORS_OUT / *VECTOR_TYPE_OUT (when nonnull) receive the
   intermediate-vector count and type, and PERMUTES the two checked
   permute masks.
   NOTE(review): many lines are missing from this extract (the PERMUTES
   parameter declaration, the outer loop construct around nvectors
   doubling, several returns and braces -- original lines 367-368,
   371-372, 376-377, 381, 385, 387, 391, 400-401, 405, 410-411, 413,
   415-416, 418, 420-425, 427-428); read against the upstream
   source.  */
364 can_duplicate_and_interleave_p (vec_info
*vinfo
, unsigned int count
,
365 tree elt_type
, unsigned int *nvectors_out
,
366 tree
*vector_type_out
,
369 tree base_vector_type
= get_vectype_for_scalar_type (vinfo
, elt_type
, count
);
/* No usable base vector type -- give up immediately.  */
370 if (!base_vector_type
|| !VECTOR_MODE_P (TYPE_MODE (base_vector_type
)))
373 machine_mode base_vector_mode
= TYPE_MODE (base_vector_type
);
374 poly_int64 elt_bytes
= count
* GET_MODE_UNIT_SIZE (base_vector_mode
);
375 unsigned int nvectors
= 1;
378 scalar_int_mode int_mode
;
379 poly_int64 elt_bits
= elt_bytes
* BITS_PER_UNIT
;
/* Look for an integer mode wide enough to fuse COUNT elements.  */
380 if (int_mode_for_size (elt_bits
, 1).exists (&int_mode
))
382 /* Get the natural vector type for this SLP group size. */
383 tree int_type
= build_nonstandard_integer_type
384 (GET_MODE_BITSIZE (int_mode
), 1);
386 = get_vectype_for_scalar_type (vinfo
, int_type
, count
);
388 && VECTOR_MODE_P (TYPE_MODE (vector_type
))
389 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type
)),
390 GET_MODE_SIZE (base_vector_mode
)))
392 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
393 together into elements of type INT_TYPE and using the result
394 to build NVECTORS vectors. */
395 poly_uint64 nelts
= GET_MODE_NUNITS (TYPE_MODE (vector_type
));
/* Two interleaving selectors, encoded as 2-pattern series.  */
396 vec_perm_builder
sel1 (nelts
, 2, 3);
397 vec_perm_builder
sel2 (nelts
, 2, 3);
398 poly_int64 half_nelts
= exact_div (nelts
, 2);
399 for (unsigned int i
= 0; i
< 3; ++i
)
402 sel1
.quick_push (i
+ nelts
);
403 sel2
.quick_push (half_nelts
+ i
);
404 sel2
.quick_push (half_nelts
+ i
+ nelts
);
406 vec_perm_indices
indices1 (sel1
, 2, nelts
);
407 vec_perm_indices
indices2 (sel2
, 2, nelts
);
/* Both permutations must be representable as target constants.  */
408 if (can_vec_perm_const_p (TYPE_MODE (vector_type
), indices1
)
409 && can_vec_perm_const_p (TYPE_MODE (vector_type
), indices2
))
412 *nvectors_out
= nvectors
;
414 *vector_type_out
= vector_type
;
417 permutes
[0] = vect_gen_perm_mask_checked (vector_type
,
419 permutes
[1] = vect_gen_perm_mask_checked (vector_type
,
/* Halve the element size for the next attempt; fail when odd.  */
426 if (!multiple_p (elt_bytes
, 2, &elt_bytes
))
432 /* Return true if DTA and DTB match. */
/* Return true if def types DTA and DTB are compatible for SLP
   purposes: external and constant defs are treated as interchangeable.
   NOTE(review): the first half of the return expression (original
   lines 436-437, presumably the dta == dtb comparison) is missing from
   this extract.  */
435 vect_def_types_match (enum vect_def_type dta
, enum vect_def_type dtb
)
/* External and constant defs match each other.  */
438 || ((dta
== vect_external_def
|| dta
== vect_constant_def
)
439 && (dtb
== vect_external_def
|| dtb
== vect_constant_def
)));
442 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
443 they are of a valid type and that they match the defs of the first stmt of
444 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
445 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
446 indicates swap is required for cond_expr stmts. Specifically, *SWAP
447 is 1 if STMT is cond and operands of comparison need to be swapped;
448 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
449 If there is any operand swap in this function, *SWAP is set to non-zero
451 If there was a fatal error return -1; if the error could be corrected by
452 swapping operands of father node of this one, return 1; if everything is
455 vect_get_and_check_slp_defs (vec_info
*vinfo
, unsigned char swap
,
457 vec
<stmt_vec_info
> stmts
, unsigned stmt_num
,
458 vec
<slp_oprnd_info
> *oprnds_info
)
460 stmt_vec_info stmt_info
= stmts
[stmt_num
];
462 unsigned int i
, number_of_oprnds
;
463 enum vect_def_type dt
= vect_uninitialized_def
;
464 slp_oprnd_info oprnd_info
;
465 int first_op_idx
= 1;
466 unsigned int commutative_op
= -1U;
467 bool first_op_cond
= false;
468 bool first
= stmt_num
== 0;
470 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
472 number_of_oprnds
= gimple_call_num_args (stmt
);
474 if (gimple_call_internal_p (stmt
))
476 internal_fn ifn
= gimple_call_internal_fn (stmt
);
477 commutative_op
= first_commutative_argument (ifn
);
479 /* Masked load, only look at mask. */
480 if (ifn
== IFN_MASK_LOAD
)
482 number_of_oprnds
= 1;
483 /* Mask operand index. */
488 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
490 enum tree_code code
= gimple_assign_rhs_code (stmt
);
491 number_of_oprnds
= gimple_num_ops (stmt
) - 1;
492 /* Swap can only be done for cond_expr if asked to, otherwise we
493 could result in different comparison code to the first stmt. */
494 if (code
== COND_EXPR
495 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt
)))
497 first_op_cond
= true;
501 commutative_op
= commutative_tree_code (code
) ? 0U : -1U;
503 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
504 number_of_oprnds
= gimple_phi_num_args (stmt
);
508 bool swapped
= (swap
!= 0);
509 bool backedge
= false;
510 gcc_assert (!swapped
|| first_op_cond
);
511 enum vect_def_type
*dts
= XALLOCAVEC (enum vect_def_type
, number_of_oprnds
);
512 for (i
= 0; i
< number_of_oprnds
; i
++)
516 /* Map indicating how operands of cond_expr should be swapped. */
517 int maps
[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
518 int *map
= maps
[swap
];
521 oprnd
= TREE_OPERAND (gimple_op (stmt_info
->stmt
,
522 first_op_idx
), map
[i
]);
524 oprnd
= gimple_op (stmt_info
->stmt
, map
[i
]);
526 else if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
528 oprnd
= gimple_phi_arg_def (stmt
, i
);
529 backedge
= dominated_by_p (CDI_DOMINATORS
,
530 gimple_phi_arg_edge (stmt
, i
)->src
,
531 gimple_bb (stmt_info
->stmt
));
534 oprnd
= gimple_op (stmt_info
->stmt
, first_op_idx
+ (swapped
? !i
: i
));
535 if (TREE_CODE (oprnd
) == VIEW_CONVERT_EXPR
)
536 oprnd
= TREE_OPERAND (oprnd
, 0);
538 oprnd_info
= (*oprnds_info
)[i
];
540 stmt_vec_info def_stmt_info
;
541 if (!vect_is_simple_use (oprnd
, vinfo
, &dts
[i
], &def_stmt_info
))
543 if (dump_enabled_p ())
544 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
545 "Build SLP failed: can't analyze def for %T\n",
553 oprnd_info
->def_stmts
.quick_push (NULL
);
554 oprnd_info
->ops
.quick_push (NULL_TREE
);
555 oprnd_info
->first_dt
= vect_uninitialized_def
;
559 oprnd_info
->def_stmts
.quick_push (def_stmt_info
);
560 oprnd_info
->ops
.quick_push (oprnd
);
563 && is_pattern_stmt_p (def_stmt_info
))
565 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info
))
567 oprnd_info
->any_pattern
= true;
569 /* If we promote this to external use the original stmt def. */
570 oprnd_info
->ops
.last ()
571 = gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
574 /* If there's a extern def on a backedge make sure we can
575 code-generate at the region start.
576 ??? This is another case that could be fixed by adjusting
577 how we split the function but at the moment we'd have conflicting
580 && dts
[i
] == vect_external_def
581 && is_a
<bb_vec_info
> (vinfo
)
582 && TREE_CODE (oprnd
) == SSA_NAME
583 && !SSA_NAME_IS_DEFAULT_DEF (oprnd
)
584 && !dominated_by_p (CDI_DOMINATORS
,
585 as_a
<bb_vec_info
> (vinfo
)->bbs
[0],
586 gimple_bb (SSA_NAME_DEF_STMT (oprnd
))))
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
590 "Build SLP failed: extern def %T only defined "
591 "on backedge\n", oprnd
);
597 tree type
= TREE_TYPE (oprnd
);
599 if ((dt
== vect_constant_def
600 || dt
== vect_external_def
)
601 && !GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
602 && (TREE_CODE (type
) == BOOLEAN_TYPE
603 || !can_duplicate_and_interleave_p (vinfo
, stmts
.length (),
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
608 "Build SLP failed: invalid type of def "
609 "for variable-length SLP %T\n", oprnd
);
613 /* For the swapping logic below force vect_reduction_def
614 for the reduction op in a SLP reduction group. */
615 if (!STMT_VINFO_DATA_REF (stmt_info
)
616 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
617 && (int)i
== STMT_VINFO_REDUC_IDX (stmt_info
)
619 dts
[i
] = dt
= vect_reduction_def
;
621 /* Check the types of the definition. */
624 case vect_external_def
:
625 case vect_constant_def
:
626 case vect_internal_def
:
627 case vect_reduction_def
:
628 case vect_induction_def
:
629 case vect_nested_cycle
:
633 /* FORNOW: Not supported. */
634 if (dump_enabled_p ())
635 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
636 "Build SLP failed: illegal type of def %T\n",
641 oprnd_info
->first_dt
= dt
;
642 oprnd_info
->first_op_type
= type
;
648 /* Now match the operand definition types to that of the first stmt. */
649 for (i
= 0; i
< number_of_oprnds
;)
657 oprnd_info
= (*oprnds_info
)[i
];
659 stmt_vec_info def_stmt_info
= oprnd_info
->def_stmts
[stmt_num
];
660 oprnd
= oprnd_info
->ops
[stmt_num
];
661 tree type
= TREE_TYPE (oprnd
);
663 if (!types_compatible_p (oprnd_info
->first_op_type
, type
))
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
667 "Build SLP failed: different operand types\n");
671 /* Not first stmt of the group, check that the def-stmt/s match
672 the def-stmt/s of the first stmt. Allow different definition
673 types for reduction chains: the first stmt must be a
674 vect_reduction_def (a phi node), and the rest
675 end in the reduction chain. */
676 if ((!vect_def_types_match (oprnd_info
->first_dt
, dt
)
677 && !(oprnd_info
->first_dt
== vect_reduction_def
678 && !STMT_VINFO_DATA_REF (stmt_info
)
679 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
681 && !STMT_VINFO_DATA_REF (def_stmt_info
)
682 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
683 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
))))
684 || (!STMT_VINFO_DATA_REF (stmt_info
)
685 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
687 || STMT_VINFO_DATA_REF (def_stmt_info
)
688 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
689 != REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
690 != (oprnd_info
->first_dt
!= vect_reduction_def
))))
692 /* Try swapping operands if we got a mismatch. For BB
693 vectorization only in case it will clearly improve things. */
694 if (i
== commutative_op
&& !swapped
695 && (!is_a
<bb_vec_info
> (vinfo
)
696 || (!vect_def_types_match ((*oprnds_info
)[i
+1]->first_dt
,
698 && (vect_def_types_match (oprnd_info
->first_dt
, dts
[i
+1])
699 || vect_def_types_match
700 ((*oprnds_info
)[i
+1]->first_dt
, dts
[i
])))))
702 if (dump_enabled_p ())
703 dump_printf_loc (MSG_NOTE
, vect_location
,
704 "trying swapped operands\n");
705 std::swap (dts
[i
], dts
[i
+1]);
706 std::swap ((*oprnds_info
)[i
]->def_stmts
[stmt_num
],
707 (*oprnds_info
)[i
+1]->def_stmts
[stmt_num
]);
708 std::swap ((*oprnds_info
)[i
]->ops
[stmt_num
],
709 (*oprnds_info
)[i
+1]->ops
[stmt_num
]);
714 if (is_a
<bb_vec_info
> (vinfo
)
715 && !oprnd_info
->any_pattern
)
717 /* Now for commutative ops we should see whether we can
718 make the other operand matching. */
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
721 "treating operand as external\n");
722 oprnd_info
->first_dt
= dt
= vect_external_def
;
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
728 "Build SLP failed: different types\n");
733 /* Make sure to demote the overall operand to external. */
734 if (dt
== vect_external_def
)
735 oprnd_info
->first_dt
= vect_external_def
;
736 /* For a SLP reduction chain we want to duplicate the reduction to
737 each of the chain members. That gets us a sane SLP graph (still
738 the stmts are not 100% correct wrt the initial values). */
739 else if ((dt
== vect_internal_def
740 || dt
== vect_reduction_def
)
741 && oprnd_info
->first_dt
== vect_reduction_def
742 && !STMT_VINFO_DATA_REF (stmt_info
)
743 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
)
744 && !STMT_VINFO_DATA_REF (def_stmt_info
)
745 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info
)
746 == REDUC_GROUP_FIRST_ELEMENT (stmt_info
)))
748 oprnd_info
->def_stmts
[stmt_num
] = oprnd_info
->def_stmts
[0];
749 oprnd_info
->ops
[stmt_num
] = oprnd_info
->ops
[0];
758 if (dump_enabled_p ())
759 dump_printf_loc (MSG_NOTE
, vect_location
,
760 "swapped operands to match def types in %G",
767 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
768 Return true if we can, meaning that this choice doesn't conflict with
769 existing SLP nodes that use STMT_INFO. */
/* Try to assign VECTYPE to STMT_INFO for BB vectorization.  Succeeds
   when the existing vectype is already compatible, or when the new
   type can be installed on the statement (and, for grouped accesses,
   on every member of the group) without conflicting with pattern
   stmts.  On conflict, dump the old/new types and fail.
   NOTE(review): several condition/return/brace lines are missing from
   this extract (original lines 773, 775, 777, 779, 788-791, 795-797,
   799, 801-803, 805, 813-814) -- in particular the guards before the
   two "return useless_type_conversion_p"/"return true" exits; read
   against the upstream source.  */
772 vect_update_shared_vectype (stmt_vec_info stmt_info
, tree vectype
)
774 tree old_vectype
= STMT_VINFO_VECTYPE (stmt_info
);
776 return useless_type_conversion_p (vectype
, old_vectype
);
778 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
780 /* We maintain the invariant that if any statement in the group is
781 used, all other members of the group have the same vector type. */
782 stmt_vec_info first_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
783 stmt_vec_info member_info
= first_info
;
/* A pattern member with an incompatible type blocks the update.  */
784 for (; member_info
; member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
785 if (is_pattern_stmt_p (member_info
)
786 && !useless_type_conversion_p (vectype
,
787 STMT_VINFO_VECTYPE (member_info
)))
/* Install the new type on every member of the group.  */
792 for (member_info
= first_info
; member_info
;
793 member_info
= DR_GROUP_NEXT_ELEMENT (member_info
))
794 STMT_VINFO_VECTYPE (member_info
) = vectype
;
798 else if (!is_pattern_stmt_p (stmt_info
))
800 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
/* Conflict: report the incompatible old and new vector types.  */
804 if (dump_enabled_p ())
806 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
807 "Build SLP failed: incompatible vector"
808 " types for: %G", stmt_info
->stmt
);
809 dump_printf_loc (MSG_NOTE
, vect_location
,
810 " old vector type: %T\n", old_vectype
);
811 dump_printf_loc (MSG_NOTE
, vect_location
,
812 " new vector type: %T\n", vectype
);
817 /* Return true if call statements CALL1 and CALL2 are similar enough
818 to be combined into the same SLP group. */
/* Return whether call statements CALL1 and CALL2 are similar enough to
   be combined into one SLP group: same argument count, same combined
   function, and -- for internal calls -- compatible lhs and argument
   types, or -- for ordinary calls -- equal callee and fntype.
   NOTE(review): every "return false" / "return true" and the brace
   lines (original lines 822, 825-826, 828-829, 831, 834, 838-841,
   844-845, 847-850) are missing from this extract.  */
821 compatible_calls_p (gcall
*call1
, gcall
*call2
)
823 unsigned int nargs
= gimple_call_num_args (call1
);
824 if (nargs
!= gimple_call_num_args (call2
))
827 if (gimple_call_combined_fn (call1
) != gimple_call_combined_fn (call2
))
/* Internal calls: compare types structurally.  */
830 if (gimple_call_internal_p (call1
))
832 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1
)),
833 TREE_TYPE (gimple_call_lhs (call2
))))
835 for (unsigned int i
= 0; i
< nargs
; ++i
)
836 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1
, i
)),
837 TREE_TYPE (gimple_call_arg (call2
, i
))))
/* Ordinary calls: the callee expression and fntype must match.  */
842 if (!operand_equal_p (gimple_call_fn (call1
),
843 gimple_call_fn (call2
), 0))
846 if (gimple_call_fntype (call1
) != gimple_call_fntype (call2
))
852 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
853 caller's attempt to find the vector type in STMT_INFO with the narrowest
854 element type. Return true if VECTYPE is nonnull and if it is valid
855 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
856 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
857 vect_build_slp_tree. */
/* Validate VECTYPE for STMT_INFO and fold its unit count into
   *MAX_NUNITS.  Fails (fatal mismatch) when VECTYPE is absent, or for
   basic-block vectorization when the subparts are non-constant or
   exceed GROUP_SIZE (which would require unrolling).
   NOTE(review): the guard testing !vectype, the stmt argument of the
   first dump, the return statements and brace lines (original lines
   863-865, 869, 871-873, 881, 887-889, 892-893) are missing from this
   extract.  */
860 vect_record_max_nunits (vec_info
*vinfo
, stmt_vec_info stmt_info
,
861 unsigned int group_size
,
862 tree vectype
, poly_uint64
*max_nunits
)
866 if (dump_enabled_p ())
867 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
868 "Build SLP failed: unsupported data-type in %G\n",
870 /* Fatal mismatch. */
874 /* If populating the vector type requires unrolling then fail
875 before adjusting *max_nunits for basic-block vectorization. */
876 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
877 unsigned HOST_WIDE_INT const_nunits
;
/* BB SLP cannot unroll: vector must not span more than the group.  */
878 if (is_a
<bb_vec_info
> (vinfo
)
879 && (!nunits
.is_constant (&const_nunits
)
880 || const_nunits
> group_size
))
882 if (dump_enabled_p ())
883 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
884 "Build SLP failed: unrolling required "
885 "in basic block SLP\n");
886 /* Fatal mismatch. */
890 /* In case of multiple types we need to detect the smallest type. */
891 vect_update_max_nunits (max_nunits
, vectype
);
895 /* Verify if the scalar stmts STMTS are isomorphic, require data
896 permutation or are of unsupported types of operation. Return
897 true if they are, otherwise return false and indicate in *MATCHES
898 which stmts are not isomorphic to the first one. If MATCHES[0]
899 is false then this indicates the comparison could not be
900 carried out or the stmts will never be vectorized by SLP.
902 Note COND_EXPR is possibly isomorphic to another one after swapping its
903 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
904 the first stmt by swapping the two operands of comparison; set SWAP[i]
905 to 2 if stmt I is isomorphic to the first stmt by inverting the code
906 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
907 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
910 vect_build_slp_tree_1 (vec_info
*vinfo
, unsigned char *swap
,
911 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
912 poly_uint64
*max_nunits
, bool *matches
,
913 bool *two_operators
, tree
*node_vectype
)
916 stmt_vec_info first_stmt_info
= stmts
[0];
917 enum tree_code first_stmt_code
= ERROR_MARK
;
918 enum tree_code alt_stmt_code
= ERROR_MARK
;
919 enum tree_code rhs_code
= ERROR_MARK
;
920 enum tree_code first_cond_code
= ERROR_MARK
;
922 bool need_same_oprnds
= false;
923 tree vectype
= NULL_TREE
, first_op1
= NULL_TREE
;
926 machine_mode optab_op2_mode
;
927 machine_mode vec_mode
;
928 stmt_vec_info first_load
= NULL
, prev_first_load
= NULL
;
929 bool first_stmt_load_p
= false, load_p
= false;
930 bool first_stmt_phi_p
= false, phi_p
= false;
932 /* For every stmt in NODE find its def stmt/s. */
933 stmt_vec_info stmt_info
;
934 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
936 gimple
*stmt
= stmt_info
->stmt
;
940 if (dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE
, vect_location
, "Build SLP for %G", stmt
);
943 /* Fail to vectorize statements marked as unvectorizable, throw
945 if (!STMT_VINFO_VECTORIZABLE (stmt_info
)
946 || stmt_can_throw_internal (cfun
, stmt
)
947 || gimple_has_volatile_ops (stmt
))
949 if (dump_enabled_p ())
950 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
951 "Build SLP failed: unvectorizable statement %G",
953 /* ??? For BB vectorization we want to commutate operands in a way
954 to shuffle all unvectorizable defs into one operand and have
955 the other still vectorized. The following doesn't reliably
956 work for this though but it's the easiest we can do here. */
957 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
959 /* Fatal mismatch. */
964 lhs
= gimple_get_lhs (stmt
);
965 if (lhs
== NULL_TREE
)
967 if (dump_enabled_p ())
968 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
969 "Build SLP failed: not GIMPLE_ASSIGN nor "
970 "GIMPLE_CALL %G", stmt
);
971 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
973 /* Fatal mismatch. */
979 if (!vect_get_vector_types_for_stmt (vinfo
, stmt_info
, &vectype
,
980 &nunits_vectype
, group_size
)
982 && !vect_record_max_nunits (vinfo
, stmt_info
, group_size
,
983 nunits_vectype
, max_nunits
)))
985 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
987 /* Fatal mismatch. */
992 gcc_assert (vectype
);
994 gcall
*call_stmt
= dyn_cast
<gcall
*> (stmt
);
997 rhs_code
= CALL_EXPR
;
999 if (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
))
1001 else if ((gimple_call_internal_p (call_stmt
)
1002 && (!vectorizable_internal_fn_p
1003 (gimple_call_internal_fn (call_stmt
))))
1004 || gimple_call_tail_p (call_stmt
)
1005 || gimple_call_noreturn_p (call_stmt
)
1006 || !gimple_call_nothrow_p (call_stmt
)
1007 || gimple_call_chain (call_stmt
))
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1011 "Build SLP failed: unsupported call type %G",
1013 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1015 /* Fatal mismatch. */
1020 else if (gimple_code (stmt
) == GIMPLE_PHI
)
1022 rhs_code
= ERROR_MARK
;
1027 rhs_code
= gimple_assign_rhs_code (stmt
);
1028 load_p
= gimple_vuse (stmt
);
1031 /* Check the operation. */
1034 *node_vectype
= vectype
;
1035 first_stmt_code
= rhs_code
;
1036 first_stmt_load_p
= load_p
;
1037 first_stmt_phi_p
= phi_p
;
1039 /* Shift arguments should be equal in all the packed stmts for a
1040 vector shift with scalar shift operand. */
1041 if (rhs_code
== LSHIFT_EXPR
|| rhs_code
== RSHIFT_EXPR
1042 || rhs_code
== LROTATE_EXPR
1043 || rhs_code
== RROTATE_EXPR
)
1045 vec_mode
= TYPE_MODE (vectype
);
1047 /* First see if we have a vector/vector shift. */
1048 optab
= optab_for_tree_code (rhs_code
, vectype
,
1052 || optab_handler (optab
, vec_mode
) == CODE_FOR_nothing
)
1054 /* No vector/vector shift, try for a vector/scalar shift. */
1055 optab
= optab_for_tree_code (rhs_code
, vectype
,
1060 if (dump_enabled_p ())
1061 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1062 "Build SLP failed: no optab.\n");
1063 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1065 /* Fatal mismatch. */
1069 icode
= (int) optab_handler (optab
, vec_mode
);
1070 if (icode
== CODE_FOR_nothing
)
1072 if (dump_enabled_p ())
1073 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1074 "Build SLP failed: "
1075 "op not supported by target.\n");
1076 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1078 /* Fatal mismatch. */
1082 optab_op2_mode
= insn_data
[icode
].operand
[2].mode
;
1083 if (!VECTOR_MODE_P (optab_op2_mode
))
1085 need_same_oprnds
= true;
1086 first_op1
= gimple_assign_rhs2 (stmt
);
1090 else if (rhs_code
== WIDEN_LSHIFT_EXPR
)
1092 need_same_oprnds
= true;
1093 first_op1
= gimple_assign_rhs2 (stmt
);
1096 && rhs_code
== BIT_FIELD_REF
)
1098 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt
), 0);
1099 if (!is_a
<bb_vec_info
> (vinfo
)
1100 || TREE_CODE (vec
) != SSA_NAME
1101 || !types_compatible_p (vectype
, TREE_TYPE (vec
)))
1103 if (dump_enabled_p ())
1104 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1105 "Build SLP failed: "
1106 "BIT_FIELD_REF not supported\n");
1107 /* Fatal mismatch. */
1113 && gimple_call_internal_p (call_stmt
, IFN_DIV_POW2
))
1115 need_same_oprnds
= true;
1116 first_op1
= gimple_call_arg (call_stmt
, 1);
1121 if (first_stmt_code
!= rhs_code
1122 && alt_stmt_code
== ERROR_MARK
)
1123 alt_stmt_code
= rhs_code
;
1124 if ((first_stmt_code
!= rhs_code
1125 && (first_stmt_code
!= IMAGPART_EXPR
1126 || rhs_code
!= REALPART_EXPR
)
1127 && (first_stmt_code
!= REALPART_EXPR
1128 || rhs_code
!= IMAGPART_EXPR
)
1129 /* Handle mismatches in plus/minus by computing both
1130 and merging the results. */
1131 && !((first_stmt_code
== PLUS_EXPR
1132 || first_stmt_code
== MINUS_EXPR
)
1133 && (alt_stmt_code
== PLUS_EXPR
1134 || alt_stmt_code
== MINUS_EXPR
)
1135 && rhs_code
== alt_stmt_code
)
1136 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1137 && (first_stmt_code
== ARRAY_REF
1138 || first_stmt_code
== BIT_FIELD_REF
1139 || first_stmt_code
== INDIRECT_REF
1140 || first_stmt_code
== COMPONENT_REF
1141 || first_stmt_code
== MEM_REF
)))
1142 || first_stmt_load_p
!= load_p
1143 || first_stmt_phi_p
!= phi_p
)
1145 if (dump_enabled_p ())
1147 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1148 "Build SLP failed: different operation "
1149 "in stmt %G", stmt
);
1150 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1151 "original stmt %G", first_stmt_info
->stmt
);
1157 if (need_same_oprnds
)
1159 tree other_op1
= (call_stmt
1160 ? gimple_call_arg (call_stmt
, 1)
1161 : gimple_assign_rhs2 (stmt
));
1162 if (!operand_equal_p (first_op1
, other_op1
, 0))
1164 if (dump_enabled_p ())
1165 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1166 "Build SLP failed: different shift "
1167 "arguments in %G", stmt
);
1173 && first_stmt_code
== BIT_FIELD_REF
1174 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info
->stmt
), 0)
1175 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0)))
1177 if (dump_enabled_p ())
1178 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1179 "Build SLP failed: different BIT_FIELD_REF "
1180 "arguments in %G", stmt
);
1185 if (!load_p
&& rhs_code
== CALL_EXPR
)
1187 if (!compatible_calls_p (as_a
<gcall
*> (stmts
[0]->stmt
),
1188 as_a
<gcall
*> (stmt
)))
1190 if (dump_enabled_p ())
1191 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1192 "Build SLP failed: different calls in %G",
1200 && (gimple_bb (first_stmt_info
->stmt
)
1201 != gimple_bb (stmt_info
->stmt
)))
1203 if (dump_enabled_p ())
1204 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1205 "Build SLP failed: different BB for PHI "
1211 if (!types_compatible_p (vectype
, *node_vectype
))
1213 if (dump_enabled_p ())
1214 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1215 "Build SLP failed: different vector type "
1222 /* Grouped store or load. */
1223 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
1225 if (REFERENCE_CLASS_P (lhs
))
1233 first_load
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
1234 if (prev_first_load
)
1236 /* Check that there are no loads from different interleaving
1237 chains in the same node. */
1238 if (prev_first_load
!= first_load
)
1240 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
1243 "Build SLP failed: different "
1244 "interleaving chains in one node %G",
1251 prev_first_load
= first_load
;
1253 } /* Grouped access. */
1258 /* Not grouped load. */
1259 if (dump_enabled_p ())
1260 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1261 "Build SLP failed: not grouped load %G", stmt
);
1263 /* FORNOW: Not grouped loads are not supported. */
1264 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1266 /* Fatal mismatch. */
1271 /* Not memory operation. */
1273 && TREE_CODE_CLASS (rhs_code
) != tcc_binary
1274 && TREE_CODE_CLASS (rhs_code
) != tcc_unary
1275 && TREE_CODE_CLASS (rhs_code
) != tcc_expression
1276 && TREE_CODE_CLASS (rhs_code
) != tcc_comparison
1277 && rhs_code
!= VIEW_CONVERT_EXPR
1278 && rhs_code
!= CALL_EXPR
1279 && rhs_code
!= BIT_FIELD_REF
)
1281 if (dump_enabled_p ())
1282 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1283 "Build SLP failed: operation unsupported %G",
1285 if (is_a
<bb_vec_info
> (vinfo
) && i
!= 0)
1287 /* Fatal mismatch. */
1292 if (rhs_code
== COND_EXPR
)
1294 tree cond_expr
= gimple_assign_rhs1 (stmt
);
1295 enum tree_code cond_code
= TREE_CODE (cond_expr
);
1296 enum tree_code swap_code
= ERROR_MARK
;
1297 enum tree_code invert_code
= ERROR_MARK
;
1300 first_cond_code
= TREE_CODE (cond_expr
);
1301 else if (TREE_CODE_CLASS (cond_code
) == tcc_comparison
)
1303 bool honor_nans
= HONOR_NANS (TREE_OPERAND (cond_expr
, 0));
1304 swap_code
= swap_tree_comparison (cond_code
);
1305 invert_code
= invert_tree_comparison (cond_code
, honor_nans
);
1308 if (first_cond_code
== cond_code
)
1310 /* Isomorphic can be achieved by swapping. */
1311 else if (first_cond_code
== swap_code
)
1313 /* Isomorphic can be achieved by inverting. */
1314 else if (first_cond_code
== invert_code
)
1318 if (dump_enabled_p ())
1319 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1320 "Build SLP failed: different"
1321 " operation %G", stmt
);
1331 for (i
= 0; i
< group_size
; ++i
)
1335 /* If we allowed a two-operation SLP node verify the target can cope
1336 with the permute we are going to use. */
1337 if (alt_stmt_code
!= ERROR_MARK
1338 && TREE_CODE_CLASS (alt_stmt_code
) != tcc_reference
)
1340 *two_operators
= true;
1346 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1347 Note we never remove apart from at destruction time so we do not
1348 need a special value for deleted that differs from empty. */
1351 typedef vec
<stmt_vec_info
> value_type
;
1352 typedef vec
<stmt_vec_info
> compare_type
;
1353 static inline hashval_t
hash (value_type
);
1354 static inline bool equal (value_type existing
, value_type candidate
);
1355 static inline bool is_empty (value_type x
) { return !x
.exists (); }
1356 static inline bool is_deleted (value_type x
) { return !x
.exists (); }
1357 static const bool empty_zero_p
= true;
1358 static inline void mark_empty (value_type
&x
) { x
.release (); }
1359 static inline void mark_deleted (value_type
&x
) { x
.release (); }
1360 static inline void remove (value_type
&x
) { x
.release (); }
1363 bst_traits::hash (value_type x
)
1366 for (unsigned i
= 0; i
< x
.length (); ++i
)
1367 h
.add_int (gimple_uid (x
[i
]->stmt
));
1371 bst_traits::equal (value_type existing
, value_type candidate
)
1373 if (existing
.length () != candidate
.length ())
1375 for (unsigned i
= 0; i
< existing
.length (); ++i
)
1376 if (existing
[i
] != candidate
[i
])
1381 typedef hash_map
<vec
<gimple
*>, slp_tree
,
1382 simple_hashmap_traits
<bst_traits
, slp_tree
> >
1383 scalar_stmts_to_slp_tree_map_t
;
1386 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1387 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1388 poly_uint64
*max_nunits
,
1389 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1390 scalar_stmts_to_slp_tree_map_t
*bst_map
);
1393 vect_build_slp_tree (vec_info
*vinfo
,
1394 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1395 poly_uint64
*max_nunits
,
1396 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1397 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1399 if (slp_tree
*leader
= bst_map
->get (stmts
))
1401 if (dump_enabled_p ())
1402 dump_printf_loc (MSG_NOTE
, vect_location
, "re-using %sSLP tree %p\n",
1403 *leader
? "" : "failed ", *leader
);
1406 SLP_TREE_REF_COUNT (*leader
)++;
1407 vect_update_max_nunits (max_nunits
, (*leader
)->max_nunits
);
1412 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1413 so we can pick up backedge destinations during discovery. */
1414 slp_tree res
= new _slp_tree
;
1415 SLP_TREE_DEF_TYPE (res
) = vect_internal_def
;
1416 SLP_TREE_SCALAR_STMTS (res
) = stmts
;
1417 bst_map
->put (stmts
.copy (), res
);
1421 if (dump_enabled_p ())
1422 dump_printf_loc (MSG_NOTE
, vect_location
,
1423 "SLP discovery limit exceeded\n");
1424 bool existed_p
= bst_map
->put (stmts
, NULL
);
1425 gcc_assert (existed_p
);
1426 /* Mark the node invalid so we can detect those when still in use
1427 as backedge destinations. */
1428 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1429 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1430 vect_free_slp_tree (res
);
1431 memset (matches
, 0, sizeof (bool) * group_size
);
1436 poly_uint64 this_max_nunits
= 1;
1437 slp_tree res_
= vect_build_slp_tree_2 (vinfo
, res
, stmts
, group_size
,
1439 matches
, limit
, tree_size
, bst_map
);
1442 bool existed_p
= bst_map
->put (stmts
, NULL
);
1443 gcc_assert (existed_p
);
1444 /* Mark the node invalid so we can detect those when still in use
1445 as backedge destinations. */
1446 SLP_TREE_SCALAR_STMTS (res
) = vNULL
;
1447 SLP_TREE_DEF_TYPE (res
) = vect_uninitialized_def
;
1448 vect_free_slp_tree (res
);
1452 gcc_assert (res_
== res
);
1453 res
->max_nunits
= this_max_nunits
;
1454 vect_update_max_nunits (max_nunits
, this_max_nunits
);
1455 /* Keep a reference for the bst_map use. */
1456 SLP_TREE_REF_COUNT (res
)++;
1461 /* Recursively build an SLP tree starting from NODE.
1462 Fail (and return a value not equal to zero) if def-stmts are not
1463 isomorphic, require data permutation or are of unsupported types of
1464 operation. Otherwise, return 0.
1465 The value returned is the depth in the SLP tree where a mismatch
1469 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1470 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1471 poly_uint64
*max_nunits
,
1472 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1473 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1475 unsigned nops
, i
, this_tree_size
= 0;
1476 poly_uint64 this_max_nunits
= *max_nunits
;
1480 stmt_vec_info stmt_info
= stmts
[0];
1481 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1482 nops
= gimple_call_num_args (stmt
);
1483 else if (gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
))
1485 nops
= gimple_num_ops (stmt
) - 1;
1486 if (gimple_assign_rhs_code (stmt
) == COND_EXPR
)
1489 else if (gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1490 nops
= gimple_phi_num_args (phi
);
1494 /* If the SLP node is a PHI (induction or reduction), terminate
1496 bool *skip_args
= XALLOCAVEC (bool, nops
);
1497 memset (skip_args
, 0, sizeof (bool) * nops
);
1498 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1499 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1501 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1502 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1504 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1508 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1509 if (def_type
== vect_induction_def
)
1511 /* Induction PHIs are not cycles but walk the initial
1512 value. Only for inner loops through, for outer loops
1513 we need to pick up the value from the actual PHIs
1514 to more easily support peeling and epilogue vectorization. */
1515 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1516 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1517 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1520 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1522 else if (def_type
== vect_reduction_def
1523 || def_type
== vect_double_reduction_def
1524 || def_type
== vect_nested_cycle
)
1526 /* Else def types have to match. */
1527 stmt_vec_info other_info
;
1528 bool all_same
= true;
1529 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1531 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1533 if (other_info
!= stmt_info
)
1536 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1537 /* Reduction initial values are not explicitely represented. */
1538 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1539 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1540 /* Reduction chain backedge defs are filled manually.
1541 ??? Need a better way to identify a SLP reduction chain PHI.
1542 Or a better overall way to SLP match those. */
1543 if (all_same
&& def_type
== vect_reduction_def
)
1544 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1546 else if (def_type
!= vect_internal_def
)
1551 bool two_operators
= false;
1552 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1553 tree vectype
= NULL_TREE
;
1554 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
1555 &this_max_nunits
, matches
, &two_operators
,
1559 /* If the SLP node is a load, terminate the recursion unless masked. */
1560 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1561 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1563 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
1566 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
1571 *max_nunits
= this_max_nunits
;
1573 node
= vect_create_new_slp_node (node
, stmts
, 0);
1574 SLP_TREE_VECTYPE (node
) = vectype
;
1575 /* And compute the load permutation. Whether it is actually
1576 a permutation depends on the unrolling factor which is
1578 vec
<unsigned> load_permutation
;
1580 stmt_vec_info load_info
;
1581 load_permutation
.create (group_size
);
1582 stmt_vec_info first_stmt_info
1583 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
1584 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
1586 int load_place
= vect_get_place_in_interleaving_chain
1587 (load_info
, first_stmt_info
);
1588 gcc_assert (load_place
!= -1);
1589 load_permutation
.safe_push (load_place
);
1591 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
1595 else if (gimple_assign_single_p (stmt_info
->stmt
)
1596 && !gimple_vuse (stmt_info
->stmt
)
1597 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
1599 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1600 the same SSA name vector of a compatible type to vectype. */
1601 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
1602 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
1603 stmt_vec_info estmt_info
;
1604 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
1606 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
1607 tree bfref
= gimple_assign_rhs1 (estmt
);
1609 if (!known_eq (bit_field_size (bfref
),
1610 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
1611 || !constant_multiple_p (bit_field_offset (bfref
),
1612 bit_field_size (bfref
), &lane
))
1617 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
1619 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
1620 SLP_TREE_VECTYPE (vnode
) = TREE_TYPE (vec
);
1621 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
1622 /* We are always building a permutation node even if it is an identity
1623 permute to shield the rest of the vectorizer from the odd node
1624 representing an actual vector without any scalar ops.
1625 ??? We could hide it completely with making the permute node
1627 node
= vect_create_new_slp_node (node
, stmts
, 1);
1628 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1629 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
1630 SLP_TREE_VECTYPE (node
) = vectype
;
1631 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
1635 /* Get at the operands, verifying they are compatible. */
1636 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
1637 slp_oprnd_info oprnd_info
;
1638 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
1640 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
], skip_args
,
1641 stmts
, i
, &oprnds_info
);
1643 matches
[(res
== -1) ? 0 : i
] = false;
1647 for (i
= 0; i
< group_size
; ++i
)
1650 vect_free_oprnd_info (oprnds_info
);
1655 auto_vec
<slp_tree
, 4> children
;
1657 stmt_info
= stmts
[0];
1659 /* Create SLP_TREE nodes for the definition node/s. */
1660 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
1665 /* We're skipping certain operands from processing, for example
1666 outer loop reduction initial defs. */
1669 children
.safe_push (NULL
);
1673 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
1675 /* COND_EXPR have one too many eventually if the condition
1677 gcc_assert (i
== 3 && nops
== 4);
1681 if (is_a
<bb_vec_info
> (vinfo
)
1682 && oprnd_info
->first_dt
== vect_internal_def
1683 && !oprnd_info
->any_pattern
)
1685 /* For BB vectorization, if all defs are the same do not
1686 bother to continue the build along the single-lane
1687 graph but use a splat of the scalar value. */
1688 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
1689 for (j
= 1; j
< group_size
; ++j
)
1690 if (oprnd_info
->def_stmts
[j
] != first_def
)
1693 /* But avoid doing this for loads where we may be
1694 able to CSE things, unless the stmt is not
1696 && (!STMT_VINFO_VECTORIZABLE (first_def
)
1697 || !gimple_vuse (first_def
->stmt
)))
1699 if (dump_enabled_p ())
1700 dump_printf_loc (MSG_NOTE
, vect_location
,
1701 "Using a splat of the uniform operand\n");
1702 oprnd_info
->first_dt
= vect_external_def
;
1706 if (oprnd_info
->first_dt
== vect_external_def
1707 || oprnd_info
->first_dt
== vect_constant_def
)
1709 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
1710 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
1711 oprnd_info
->ops
= vNULL
;
1712 children
.safe_push (invnode
);
1716 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1717 group_size
, &this_max_nunits
,
1719 &this_tree_size
, bst_map
)) != NULL
)
1721 oprnd_info
->def_stmts
= vNULL
;
1722 children
.safe_push (child
);
1726 /* If the SLP build for operand zero failed and operand zero
1727 and one can be commutated try that for the scalar stmts
1728 that failed the match. */
1730 /* A first scalar stmt mismatch signals a fatal mismatch. */
1732 /* ??? For COND_EXPRs we can swap the comparison operands
1733 as well as the arms under some constraints. */
1735 && oprnds_info
[1]->first_dt
== vect_internal_def
1736 && is_gimple_assign (stmt_info
->stmt
)
1737 /* Swapping operands for reductions breaks assumptions later on. */
1738 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_reduction_def
1739 && STMT_VINFO_DEF_TYPE (stmt_info
) != vect_double_reduction_def
)
1741 /* See whether we can swap the matching or the non-matching
1743 bool swap_not_matching
= true;
1746 for (j
= 0; j
< group_size
; ++j
)
1748 if (matches
[j
] != !swap_not_matching
)
1750 stmt_vec_info stmt_info
= stmts
[j
];
1751 /* Verify if we can swap operands of this stmt. */
1752 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
1754 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
1756 if (!swap_not_matching
)
1758 swap_not_matching
= false;
1763 while (j
!= group_size
);
1765 /* Swap mismatched definition stmts. */
1766 if (dump_enabled_p ())
1767 dump_printf_loc (MSG_NOTE
, vect_location
,
1768 "Re-trying with swapped operands of stmts ");
1769 for (j
= 0; j
< group_size
; ++j
)
1770 if (matches
[j
] == !swap_not_matching
)
1772 std::swap (oprnds_info
[0]->def_stmts
[j
],
1773 oprnds_info
[1]->def_stmts
[j
]);
1774 std::swap (oprnds_info
[0]->ops
[j
],
1775 oprnds_info
[1]->ops
[j
]);
1776 if (dump_enabled_p ())
1777 dump_printf (MSG_NOTE
, "%d ", j
);
1779 if (dump_enabled_p ())
1780 dump_printf (MSG_NOTE
, "\n");
1781 /* And try again with scratch 'matches' ... */
1782 bool *tem
= XALLOCAVEC (bool, group_size
);
1783 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
1784 group_size
, &this_max_nunits
,
1786 &this_tree_size
, bst_map
)) != NULL
)
1788 oprnd_info
->def_stmts
= vNULL
;
1789 children
.safe_push (child
);
1795 /* If the SLP build failed and we analyze a basic-block
1796 simply treat nodes we fail to build as externally defined
1797 (and thus build vectors from the scalar defs).
1798 The cost model will reject outright expensive cases.
1799 ??? This doesn't treat cases where permutation ultimatively
1800 fails (or we don't try permutation below). Ideally we'd
1801 even compute a permutation that will end up with the maximum
1803 if (is_a
<bb_vec_info
> (vinfo
)
1804 /* ??? Rejecting patterns this way doesn't work. We'd have to
1805 do extra work to cancel the pattern so the uses see the
1807 && !is_pattern_stmt_p (stmt_info
)
1808 && !oprnd_info
->any_pattern
)
1810 /* But if there's a leading vector sized set of matching stmts
1811 fail here so we can split the group. This matches the condition
1812 vect_analyze_slp_instance uses. */
1813 /* ??? We might want to split here and combine the results to support
1814 multiple vector sizes better. */
1815 for (j
= 0; j
< group_size
; ++j
)
1818 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
)))
1820 if (dump_enabled_p ())
1821 dump_printf_loc (MSG_NOTE
, vect_location
,
1822 "Building vector operands from scalars\n");
1824 child
= vect_create_new_slp_node (oprnd_info
->ops
);
1825 children
.safe_push (child
);
1826 oprnd_info
->ops
= vNULL
;
1831 gcc_assert (child
== NULL
);
1832 FOR_EACH_VEC_ELT (children
, j
, child
)
1834 vect_free_slp_tree (child
);
1835 vect_free_oprnd_info (oprnds_info
);
1839 vect_free_oprnd_info (oprnds_info
);
1841 /* If we have all children of a child built up from uniform scalars
1842 or does more than one possibly expensive vector construction then
1843 just throw that away, causing it built up from scalars.
1844 The exception is the SLP node for the vector store. */
1845 if (is_a
<bb_vec_info
> (vinfo
)
1846 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
1847 /* ??? Rejecting patterns this way doesn't work. We'd have to
1848 do extra work to cancel the pattern so the uses see the
1850 && !is_pattern_stmt_p (stmt_info
))
1854 bool all_uniform_p
= true;
1855 unsigned n_vector_builds
= 0;
1856 FOR_EACH_VEC_ELT (children
, j
, child
)
1860 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
1861 all_uniform_p
= false;
1862 else if (!vect_slp_tree_uniform_p (child
))
1864 all_uniform_p
= false;
1865 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
1869 if (all_uniform_p
|| n_vector_builds
> 1)
1873 FOR_EACH_VEC_ELT (children
, j
, child
)
1875 vect_free_slp_tree (child
);
1877 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_NOTE
, vect_location
,
1879 "Building parent vector operands from "
1880 "scalars instead\n");
1885 *tree_size
+= this_tree_size
+ 1;
1886 *max_nunits
= this_max_nunits
;
1890 /* ??? We'd likely want to either cache in bst_map sth like
1891 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
1892 the true { a+b, a+b, a+b, a+b } ... but there we don't have
1893 explicit stmts to put in so the keying on 'stmts' doesn't
1894 work (but we have the same issue with nodes that use 'ops'). */
1895 slp_tree one
= new _slp_tree
;
1896 slp_tree two
= new _slp_tree
;
1897 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
1898 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
1899 SLP_TREE_VECTYPE (one
) = vectype
;
1900 SLP_TREE_VECTYPE (two
) = vectype
;
1901 SLP_TREE_CHILDREN (one
).safe_splice (children
);
1902 SLP_TREE_CHILDREN (two
).safe_splice (children
);
1904 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
1905 SLP_TREE_REF_COUNT (child
)++;
1907 /* Here we record the original defs since this
1908 node represents the final lane configuration. */
1909 node
= vect_create_new_slp_node (node
, stmts
, 2);
1910 SLP_TREE_VECTYPE (node
) = vectype
;
1911 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
1912 SLP_TREE_CHILDREN (node
).quick_push (one
);
1913 SLP_TREE_CHILDREN (node
).quick_push (two
);
1914 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
1915 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
1916 enum tree_code ocode
= ERROR_MARK
;
1917 stmt_vec_info ostmt_info
;
1919 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
1921 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
1922 if (gimple_assign_rhs_code (ostmt
) != code0
)
1924 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
1925 ocode
= gimple_assign_rhs_code (ostmt
);
1929 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
1931 SLP_TREE_CODE (one
) = code0
;
1932 SLP_TREE_CODE (two
) = ocode
;
1933 SLP_TREE_LANES (one
) = stmts
.length ();
1934 SLP_TREE_LANES (two
) = stmts
.length ();
1935 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
1936 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
1940 node
= vect_create_new_slp_node (node
, stmts
, nops
);
1941 SLP_TREE_VECTYPE (node
) = vectype
;
1942 SLP_TREE_CHILDREN (node
).splice (children
);
1946 /* Dump a single SLP tree NODE. */
1949 vect_print_slp_tree (dump_flags_t dump_kind
, dump_location_t loc
,
1954 stmt_vec_info stmt_info
;
1957 dump_metadata_t
metadata (dump_kind
, loc
.get_impl_location ());
1958 dump_user_location_t user_loc
= loc
.get_user_location ();
1959 dump_printf_loc (metadata
, user_loc
, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1960 SLP_TREE_DEF_TYPE (node
) == vect_external_def
1962 : (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
1965 estimated_poly_value (node
->max_nunits
),
1966 SLP_TREE_REF_COUNT (node
));
1967 if (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
)
1969 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
1970 dump_printf_loc (metadata
, user_loc
, "op: VEC_PERM_EXPR\n");
1972 dump_printf_loc (metadata
, user_loc
, "op template: %G",
1973 SLP_TREE_REPRESENTATIVE (node
)->stmt
);
1975 if (SLP_TREE_SCALAR_STMTS (node
).exists ())
1976 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
1977 dump_printf_loc (metadata
, user_loc
, "\tstmt %u %G", i
, stmt_info
->stmt
);
1980 dump_printf_loc (metadata
, user_loc
, "\t{ ");
1981 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node
), i
, op
)
1982 dump_printf (metadata
, "%T%s ", op
,
1983 i
< SLP_TREE_SCALAR_OPS (node
).length () - 1 ? "," : "");
1984 dump_printf (metadata
, "}\n");
1986 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
1988 dump_printf_loc (metadata
, user_loc
, "\tload permutation {");
1989 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node
), i
, j
)
1990 dump_printf (dump_kind
, " %u", j
);
1991 dump_printf (dump_kind
, " }\n");
1993 if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
1995 dump_printf_loc (metadata
, user_loc
, "\tlane permutation {");
1996 for (i
= 0; i
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++i
)
1997 dump_printf (dump_kind
, " %u[%u]",
1998 SLP_TREE_LANE_PERMUTATION (node
)[i
].first
,
1999 SLP_TREE_LANE_PERMUTATION (node
)[i
].second
);
2000 dump_printf (dump_kind
, " }\n");
2002 if (SLP_TREE_CHILDREN (node
).is_empty ())
2004 dump_printf_loc (metadata
, user_loc
, "\tchildren");
2005 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2006 dump_printf (dump_kind
, " %p", (void *)child
);
2007 dump_printf (dump_kind
, "\n");
2011 debug (slp_tree node
)
2013 debug_dump_context ctx
;
2014 vect_print_slp_tree (MSG_NOTE
,
2015 dump_location_t::from_location_t (UNKNOWN_LOCATION
),
2019 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2022 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2023 slp_tree node
, hash_set
<slp_tree
> &visited
)
2028 if (visited
.add (node
))
2031 vect_print_slp_tree (dump_kind
, loc
, node
);
2033 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2035 vect_print_slp_graph (dump_kind
, loc
, child
, visited
);
2039 vect_print_slp_graph (dump_flags_t dump_kind
, dump_location_t loc
,
2042 hash_set
<slp_tree
> visited
;
2043 vect_print_slp_graph (dump_kind
, loc
, entry
, visited
);
2046 /* Mark the tree rooted at NODE with PURE_SLP. */
2049 vect_mark_slp_stmts (slp_tree node
, hash_set
<slp_tree
> &visited
)
2052 stmt_vec_info stmt_info
;
2055 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2058 if (visited
.add (node
))
2061 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2062 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
2064 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2066 vect_mark_slp_stmts (child
, visited
);
2070 vect_mark_slp_stmts (slp_tree node
)
2072 hash_set
<slp_tree
> visited
;
2073 vect_mark_slp_stmts (node
, visited
);
2076 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2079 vect_mark_slp_stmts_relevant (slp_tree node
, hash_set
<slp_tree
> &visited
)
2082 stmt_vec_info stmt_info
;
2085 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2088 if (visited
.add (node
))
2091 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
2093 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info
)
2094 || STMT_VINFO_RELEVANT (stmt_info
) == vect_used_in_scope
);
2095 STMT_VINFO_RELEVANT (stmt_info
) = vect_used_in_scope
;
2098 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2100 vect_mark_slp_stmts_relevant (child
, visited
);
2104 vect_mark_slp_stmts_relevant (slp_tree node
)
2106 hash_set
<slp_tree
> visited
;
2107 vect_mark_slp_stmts_relevant (node
, visited
);
2111 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2114 vect_gather_slp_loads (vec
<slp_tree
> &loads
, slp_tree node
,
2115 hash_set
<slp_tree
> &visited
)
2117 if (!node
|| visited
.add (node
))
2120 if (SLP_TREE_CHILDREN (node
).length () == 0)
2122 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2124 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
2125 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2126 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2127 loads
.safe_push (node
);
2133 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2134 vect_gather_slp_loads (loads
, child
, visited
);
2139 /* Find the last store in SLP INSTANCE. */
2142 vect_find_last_scalar_stmt_in_slp (slp_tree node
)
2144 stmt_vec_info last
= NULL
;
2145 stmt_vec_info stmt_vinfo
;
2147 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2149 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2150 last
= last
? get_later_stmt (stmt_vinfo
, last
) : stmt_vinfo
;
2156 /* Find the first stmt in NODE. */
2159 vect_find_first_scalar_stmt_in_slp (slp_tree node
)
2161 stmt_vec_info first
= NULL
;
2162 stmt_vec_info stmt_vinfo
;
2164 for (int i
= 0; SLP_TREE_SCALAR_STMTS (node
).iterate (i
, &stmt_vinfo
); i
++)
2166 stmt_vinfo
= vect_orig_stmt (stmt_vinfo
);
2168 || get_later_stmt (stmt_vinfo
, first
) == first
)
2175 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2176 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2177 (also containing the first GROUP1_SIZE stmts, since stores are
2178 consecutive), the second containing the remainder.
2179 Return the first stmt in the second group. */
2181 static stmt_vec_info
2182 vect_split_slp_store_group (stmt_vec_info first_vinfo
, unsigned group1_size
)
2184 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo
) == first_vinfo
);
2185 gcc_assert (group1_size
> 0);
2186 int group2_size
= DR_GROUP_SIZE (first_vinfo
) - group1_size
;
2187 gcc_assert (group2_size
> 0);
2188 DR_GROUP_SIZE (first_vinfo
) = group1_size
;
2190 stmt_vec_info stmt_info
= first_vinfo
;
2191 for (unsigned i
= group1_size
; i
> 1; i
--)
2193 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2194 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2196 /* STMT is now the last element of the first group. */
2197 stmt_vec_info group2
= DR_GROUP_NEXT_ELEMENT (stmt_info
);
2198 DR_GROUP_NEXT_ELEMENT (stmt_info
) = 0;
2200 DR_GROUP_SIZE (group2
) = group2_size
;
2201 for (stmt_info
= group2
; stmt_info
;
2202 stmt_info
= DR_GROUP_NEXT_ELEMENT (stmt_info
))
2204 DR_GROUP_FIRST_ELEMENT (stmt_info
) = group2
;
2205 gcc_assert (DR_GROUP_GAP (stmt_info
) == 1);
2208 /* For the second group, the DR_GROUP_GAP is that before the original group,
2209 plus skipping over the first vector. */
2210 DR_GROUP_GAP (group2
) = DR_GROUP_GAP (first_vinfo
) + group1_size
;
2212 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2213 DR_GROUP_GAP (first_vinfo
) += group2_size
;
2215 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_NOTE
, vect_location
, "Split group into %d and %d\n",
2217 group1_size
, group2_size
);
2222 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2223 statements and a vector of NUNITS elements. */
2226 calculate_unrolling_factor (poly_uint64 nunits
, unsigned int group_size
)
2228 return exact_div (common_multiple (nunits
, group_size
), group_size
);
2231 /* Helper function of vect_match_slp_patterns.
2233 Attempts to match patterns against the slp tree rooted in REF_NODE using
2234 VINFO. Patterns are matched in post-order traversal.
2236 If matching is successful the value in REF_NODE is updated and returned, if
2237 not then it is returned unchanged. */
2240 vect_match_slp_patterns_2 (slp_tree
*ref_node
, vec_info
*vinfo
,
2241 slp_tree_to_load_perm_map_t
*perm_cache
,
2242 hash_set
<slp_tree
> *visited
)
2245 slp_tree node
= *ref_node
;
2246 bool found_p
= false;
2247 if (!node
|| visited
->add (node
))
2251 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2252 found_p
|= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node
)[i
],
2253 vinfo
, perm_cache
, visited
);
2255 for (unsigned x
= 0; x
< num__slp_patterns
; x
++)
2257 vect_pattern
*pattern
= slp_patterns
[x
] (perm_cache
, ref_node
);
2260 pattern
->build (vinfo
);
2269 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2272 The modified tree is returned. Patterns are tried in order and multiple
2273 patterns may match. */
2276 vect_match_slp_patterns (slp_instance instance
, vec_info
*vinfo
,
2277 hash_set
<slp_tree
> *visited
,
2278 slp_tree_to_load_perm_map_t
*perm_cache
,
2279 scalar_stmts_to_slp_tree_map_t
* /* bst_map */)
2281 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2282 slp_tree
*ref_node
= &SLP_INSTANCE_TREE (instance
);
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_NOTE
, vect_location
,
2286 "Analyzing SLP tree %p for patterns\n",
2287 SLP_INSTANCE_TREE (instance
));
2290 = vect_match_slp_patterns_2 (ref_node
, vinfo
, perm_cache
, visited
);
2294 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE
, vect_location
,
2297 "Pattern matched SLP tree\n");
2298 vect_print_slp_graph (MSG_NOTE
, vect_location
, *ref_node
);
2305 /* Analyze an SLP instance starting from a group of grouped stores. Call
2306 vect_build_slp_tree to build a tree of packed stmts if possible.
2307 Return FALSE if it's impossible to SLP any stmt in the loop. */
2310 vect_analyze_slp_instance (vec_info
*vinfo
,
2311 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2312 stmt_vec_info stmt_info
, slp_instance_kind kind
,
2313 unsigned max_tree_size
, unsigned *limit
);
2315 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
2316 of KIND. Return true if successful. */
2319 vect_build_slp_instance (vec_info
*vinfo
,
2320 slp_instance_kind kind
,
2321 vec
<stmt_vec_info
> &scalar_stmts
,
2322 stmt_vec_info root_stmt_info
,
2323 unsigned max_tree_size
, unsigned *limit
,
2324 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2325 /* ??? We need stmt_info for group splitting. */
2326 stmt_vec_info stmt_info_
)
2328 if (dump_enabled_p ())
2330 dump_printf_loc (MSG_NOTE
, vect_location
,
2331 "Starting SLP discovery for\n");
2332 for (unsigned i
= 0; i
< scalar_stmts
.length (); ++i
)
2333 dump_printf_loc (MSG_NOTE
, vect_location
,
2334 " %G", scalar_stmts
[i
]->stmt
);
2337 /* Build the tree for the SLP instance. */
2338 unsigned int group_size
= scalar_stmts
.length ();
2339 bool *matches
= XALLOCAVEC (bool, group_size
);
2340 poly_uint64 max_nunits
= 1;
2341 unsigned tree_size
= 0;
2343 slp_tree node
= vect_build_slp_tree (vinfo
, scalar_stmts
, group_size
,
2344 &max_nunits
, matches
, limit
,
2345 &tree_size
, bst_map
);
2348 /* Calculate the unrolling factor based on the smallest type. */
2349 poly_uint64 unrolling_factor
2350 = calculate_unrolling_factor (max_nunits
, group_size
);
2352 if (maybe_ne (unrolling_factor
, 1U)
2353 && is_a
<bb_vec_info
> (vinfo
))
2355 unsigned HOST_WIDE_INT const_max_nunits
;
2356 if (!max_nunits
.is_constant (&const_max_nunits
)
2357 || const_max_nunits
> group_size
)
2359 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2361 "Build SLP failed: store group "
2362 "size not a multiple of the vector size "
2363 "in basic block SLP\n");
2364 vect_free_slp_tree (node
);
2367 /* Fatal mismatch. */
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_NOTE
, vect_location
,
2370 "SLP discovery succeeded but node needs "
2372 memset (matches
, true, group_size
);
2373 matches
[group_size
/ const_max_nunits
* const_max_nunits
] = false;
2374 vect_free_slp_tree (node
);
2378 /* Create a new SLP instance. */
2379 slp_instance new_instance
= XNEW (class _slp_instance
);
2380 SLP_INSTANCE_TREE (new_instance
) = node
;
2381 SLP_INSTANCE_UNROLLING_FACTOR (new_instance
) = unrolling_factor
;
2382 SLP_INSTANCE_LOADS (new_instance
) = vNULL
;
2383 SLP_INSTANCE_ROOT_STMT (new_instance
) = root_stmt_info
;
2384 SLP_INSTANCE_KIND (new_instance
) = kind
;
2385 new_instance
->reduc_phis
= NULL
;
2386 new_instance
->cost_vec
= vNULL
;
2387 new_instance
->subgraph_entries
= vNULL
;
2389 if (dump_enabled_p ())
2390 dump_printf_loc (MSG_NOTE
, vect_location
,
2391 "SLP size %u vs. limit %u.\n",
2392 tree_size
, max_tree_size
);
2394 /* Fixup SLP reduction chains. */
2395 if (kind
== slp_inst_kind_reduc_chain
)
2397 /* If this is a reduction chain with a conversion in front
2398 amend the SLP tree with a node for that. */
2400 = vect_orig_stmt (scalar_stmts
[group_size
- 1])->stmt
;
2401 if (STMT_VINFO_DEF_TYPE (scalar_stmts
[0]) != vect_reduction_def
)
2403 /* Get at the conversion stmt - we know it's the single use
2404 of the last stmt of the reduction chain. */
2405 use_operand_p use_p
;
2406 bool r
= single_imm_use (gimple_assign_lhs (scalar_def
),
2407 &use_p
, &scalar_def
);
2409 stmt_vec_info next_info
= vinfo
->lookup_stmt (scalar_def
);
2410 next_info
= vect_stmt_to_vectorize (next_info
);
2411 scalar_stmts
= vNULL
;
2412 scalar_stmts
.create (group_size
);
2413 for (unsigned i
= 0; i
< group_size
; ++i
)
2414 scalar_stmts
.quick_push (next_info
);
2415 slp_tree conv
= vect_create_new_slp_node (scalar_stmts
, 1);
2416 SLP_TREE_VECTYPE (conv
) = STMT_VINFO_VECTYPE (next_info
);
2417 SLP_TREE_CHILDREN (conv
).quick_push (node
);
2418 SLP_INSTANCE_TREE (new_instance
) = conv
;
2419 /* We also have to fake this conversion stmt as SLP reduction
2420 group so we don't have to mess with too much code
2422 REDUC_GROUP_FIRST_ELEMENT (next_info
) = next_info
;
2423 REDUC_GROUP_NEXT_ELEMENT (next_info
) = NULL
;
2425 /* Fill the backedge child of the PHI SLP node. The
2426 general matching code cannot find it because the
2427 scalar code does not reflect how we vectorize the
2429 use_operand_p use_p
;
2430 imm_use_iterator imm_iter
;
2431 class loop
*loop
= LOOP_VINFO_LOOP (as_a
<loop_vec_info
> (vinfo
));
2432 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
,
2433 gimple_get_lhs (scalar_def
))
2434 /* There are exactly two non-debug uses, the reduction
2435 PHI and the loop-closed PHI node. */
2436 if (!is_gimple_debug (USE_STMT (use_p
))
2437 && gimple_bb (USE_STMT (use_p
)) == loop
->header
)
2439 auto_vec
<stmt_vec_info
, 64> phis (group_size
);
2440 stmt_vec_info phi_info
2441 = vinfo
->lookup_stmt (USE_STMT (use_p
));
2442 for (unsigned i
= 0; i
< group_size
; ++i
)
2443 phis
.quick_push (phi_info
);
2444 slp_tree
*phi_node
= bst_map
->get (phis
);
2445 unsigned dest_idx
= loop_latch_edge (loop
)->dest_idx
;
2446 SLP_TREE_CHILDREN (*phi_node
)[dest_idx
]
2447 = SLP_INSTANCE_TREE (new_instance
);
2448 SLP_INSTANCE_TREE (new_instance
)->refcnt
++;
2452 vinfo
->slp_instances
.safe_push (new_instance
);
2454 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2455 the number of scalar stmts in the root in a few places.
2456 Verify that assumption holds. */
2457 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance
))
2458 .length () == group_size
);
2460 if (dump_enabled_p ())
2462 dump_printf_loc (MSG_NOTE
, vect_location
,
2463 "Final SLP tree for instance %p:\n", new_instance
);
2464 vect_print_slp_graph (MSG_NOTE
, vect_location
,
2465 SLP_INSTANCE_TREE (new_instance
));
2473 /* Failed to SLP. */
2474 /* Free the allocated memory. */
2475 scalar_stmts
.release ();
2478 stmt_vec_info stmt_info
= stmt_info_
;
2479 /* Try to break the group up into pieces. */
2480 if (kind
== slp_inst_kind_store
)
2482 /* ??? We could delay all the actual splitting of store-groups
2483 until after SLP discovery of the original group completed.
2484 Then we can recurse to vect_build_slp_instance directly. */
2485 for (i
= 0; i
< group_size
; i
++)
2489 /* For basic block SLP, try to break the group up into multiples of
2491 if (is_a
<bb_vec_info
> (vinfo
)
2492 && (i
> 1 && i
< group_size
))
2495 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info
)));
2496 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
2497 1 << floor_log2 (i
));
2498 unsigned HOST_WIDE_INT const_nunits
;
2500 && TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
))
2502 /* Split into two groups at the first vector boundary. */
2503 gcc_assert ((const_nunits
& (const_nunits
- 1)) == 0);
2504 unsigned group1_size
= i
& ~(const_nunits
- 1);
2506 if (dump_enabled_p ())
2507 dump_printf_loc (MSG_NOTE
, vect_location
,
2508 "Splitting SLP group at stmt %u\n", i
);
2509 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2511 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2512 kind
, max_tree_size
,
2514 /* Split the rest at the failure point and possibly
2515 re-analyze the remaining matching part if it has
2516 at least two lanes. */
2518 && (i
+ 1 < group_size
2519 || i
- group1_size
> 1))
2521 stmt_vec_info rest2
= rest
;
2522 rest
= vect_split_slp_store_group (rest
, i
- group1_size
);
2523 if (i
- group1_size
> 1)
2524 res
|= vect_analyze_slp_instance (vinfo
, bst_map
, rest2
,
2525 kind
, max_tree_size
,
2528 /* Re-analyze the non-matching tail if it has at least
2530 if (i
+ 1 < group_size
)
2531 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2532 rest
, kind
, max_tree_size
,
2538 /* For loop vectorization split into arbitrary pieces of size > 1. */
2539 if (is_a
<loop_vec_info
> (vinfo
)
2540 && (i
> 1 && i
< group_size
))
2542 unsigned group1_size
= i
;
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_NOTE
, vect_location
,
2546 "Splitting SLP group at stmt %u\n", i
);
2548 stmt_vec_info rest
= vect_split_slp_store_group (stmt_info
,
2550 /* Loop vectorization cannot handle gaps in stores, make sure
2551 the split group appears as strided. */
2552 STMT_VINFO_STRIDED_P (rest
) = 1;
2553 DR_GROUP_GAP (rest
) = 0;
2554 STMT_VINFO_STRIDED_P (stmt_info
) = 1;
2555 DR_GROUP_GAP (stmt_info
) = 0;
2557 bool res
= vect_analyze_slp_instance (vinfo
, bst_map
, stmt_info
,
2558 kind
, max_tree_size
, limit
);
2559 if (i
+ 1 < group_size
)
2560 res
|= vect_analyze_slp_instance (vinfo
, bst_map
,
2561 rest
, kind
, max_tree_size
, limit
);
2566 /* Even though the first vector did not all match, we might be able to SLP
2567 (some) of the remainder. FORNOW ignore this possibility. */
2570 /* Failed to SLP. */
2571 if (dump_enabled_p ())
2572 dump_printf_loc (MSG_NOTE
, vect_location
, "SLP discovery failed\n");
2577 /* Analyze an SLP instance starting from a group of grouped stores. Call
2578 vect_build_slp_tree to build a tree of packed stmts if possible.
2579 Return FALSE if it's impossible to SLP any stmt in the loop. */
2582 vect_analyze_slp_instance (vec_info
*vinfo
,
2583 scalar_stmts_to_slp_tree_map_t
*bst_map
,
2584 stmt_vec_info stmt_info
,
2585 slp_instance_kind kind
,
2586 unsigned max_tree_size
, unsigned *limit
)
2589 vec
<stmt_vec_info
> scalar_stmts
;
2591 if (is_a
<bb_vec_info
> (vinfo
))
2592 vect_location
= stmt_info
->stmt
;
2594 stmt_vec_info next_info
= stmt_info
;
2595 if (kind
== slp_inst_kind_store
)
2597 /* Collect the stores and store them in scalar_stmts. */
2598 scalar_stmts
.create (DR_GROUP_SIZE (stmt_info
));
2601 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2602 next_info
= DR_GROUP_NEXT_ELEMENT (next_info
);
2605 else if (kind
== slp_inst_kind_reduc_chain
)
2607 /* Collect the reduction stmts and store them in scalar_stmts. */
2608 scalar_stmts
.create (REDUC_GROUP_SIZE (stmt_info
));
2611 scalar_stmts
.quick_push (vect_stmt_to_vectorize (next_info
));
2612 next_info
= REDUC_GROUP_NEXT_ELEMENT (next_info
);
2614 /* Mark the first element of the reduction chain as reduction to properly
2615 transform the node. In the reduction analysis phase only the last
2616 element of the chain is marked as reduction. */
2617 STMT_VINFO_DEF_TYPE (stmt_info
)
2618 = STMT_VINFO_DEF_TYPE (scalar_stmts
.last ());
2619 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
))
2620 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts
.last ()));
2622 else if (kind
== slp_inst_kind_ctor
)
2624 tree rhs
= gimple_assign_rhs1 (stmt_info
->stmt
);
2626 scalar_stmts
.create (CONSTRUCTOR_NELTS (rhs
));
2627 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), i
, val
)
2629 stmt_vec_info def_info
= vinfo
->lookup_def (val
);
2630 def_info
= vect_stmt_to_vectorize (def_info
);
2631 scalar_stmts
.quick_push (def_info
);
2633 if (dump_enabled_p ())
2634 dump_printf_loc (MSG_NOTE
, vect_location
,
2635 "Analyzing vectorizable constructor: %G\n",
2638 else if (kind
== slp_inst_kind_reduc_group
)
2640 /* Collect reduction statements. */
2641 vec
<stmt_vec_info
> reductions
= as_a
<loop_vec_info
> (vinfo
)->reductions
;
2642 scalar_stmts
.create (reductions
.length ());
2643 for (i
= 0; reductions
.iterate (i
, &next_info
); i
++)
2644 if (STMT_VINFO_RELEVANT_P (next_info
)
2645 || STMT_VINFO_LIVE_P (next_info
))
2646 scalar_stmts
.quick_push (next_info
);
2647 /* If less than two were relevant/live there's nothing to SLP. */
2648 if (scalar_stmts
.length () < 2)
2654 /* Build the tree for the SLP instance. */
2655 bool res
= vect_build_slp_instance (vinfo
, kind
, scalar_stmts
,
2656 kind
== slp_inst_kind_ctor
2658 max_tree_size
, limit
, bst_map
,
2659 kind
== slp_inst_kind_store
2660 ? stmt_info
: NULL
);
2662 /* ??? If this is slp_inst_kind_store and the above succeeded here's
2663 where we should do store group splitting. */
2668 /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
2669 trees of packed scalar stmts if SLP is possible. */
2672 vect_analyze_slp (vec_info
*vinfo
, unsigned max_tree_size
)
2675 stmt_vec_info first_element
;
2676 slp_instance instance
;
2678 DUMP_VECT_SCOPE ("vect_analyze_slp");
2680 unsigned limit
= max_tree_size
;
2682 scalar_stmts_to_slp_tree_map_t
*bst_map
2683 = new scalar_stmts_to_slp_tree_map_t ();
2685 /* Find SLP sequences starting from groups of grouped stores. */
2686 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
2687 vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2688 STMT_VINFO_GROUPED_ACCESS (first_element
)
2689 ? slp_inst_kind_store
: slp_inst_kind_ctor
,
2690 max_tree_size
, &limit
);
2692 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
2694 for (unsigned i
= 0; i
< bb_vinfo
->roots
.length (); ++i
)
2696 vect_location
= bb_vinfo
->roots
[i
].root
->stmt
;
2697 if (vect_build_slp_instance (bb_vinfo
, bb_vinfo
->roots
[i
].kind
,
2698 bb_vinfo
->roots
[i
].stmts
,
2699 bb_vinfo
->roots
[i
].root
,
2700 max_tree_size
, &limit
, bst_map
, NULL
))
2701 bb_vinfo
->roots
[i
].stmts
= vNULL
;
2705 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
2707 /* Find SLP sequences starting from reduction chains. */
2708 FOR_EACH_VEC_ELT (loop_vinfo
->reduction_chains
, i
, first_element
)
2709 if (! STMT_VINFO_RELEVANT_P (first_element
)
2710 && ! STMT_VINFO_LIVE_P (first_element
))
2712 else if (! vect_analyze_slp_instance (vinfo
, bst_map
, first_element
,
2713 slp_inst_kind_reduc_chain
,
2714 max_tree_size
, &limit
))
2716 /* Dissolve reduction chain group. */
2717 stmt_vec_info vinfo
= first_element
;
2718 stmt_vec_info last
= NULL
;
2721 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
2722 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
2723 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
2727 STMT_VINFO_DEF_TYPE (first_element
) = vect_internal_def
;
2728 /* It can be still vectorized as part of an SLP reduction. */
2729 loop_vinfo
->reductions
.safe_push (last
);
2732 /* Find SLP sequences starting from groups of reductions. */
2733 if (loop_vinfo
->reductions
.length () > 1)
2734 vect_analyze_slp_instance (vinfo
, bst_map
, loop_vinfo
->reductions
[0],
2735 slp_inst_kind_reduc_group
, max_tree_size
,
2739 hash_set
<slp_tree
> visited_patterns
;
2740 slp_tree_to_load_perm_map_t perm_cache
;
2741 /* See if any patterns can be found in the SLP tree. */
2742 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo
), i
, instance
)
2743 vect_match_slp_patterns (instance
, vinfo
, &visited_patterns
, &perm_cache
,
2746 /* The map keeps a reference on SLP nodes built, release that. */
2747 for (scalar_stmts_to_slp_tree_map_t::iterator it
= bst_map
->begin ();
2748 it
!= bst_map
->end (); ++it
)
2750 vect_free_slp_tree ((*it
).second
);
2753 return opt_result::success ();
2756 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2759 vect_slp_build_vertices (hash_set
<slp_tree
> &visited
, slp_tree node
,
2760 vec
<slp_tree
> &vertices
, vec
<int> &leafs
)
2765 if (visited
.add (node
))
2768 node
->vertex
= vertices
.length ();
2769 vertices
.safe_push (node
);
2772 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
2776 vect_slp_build_vertices (visited
, child
, vertices
, leafs
);
2779 leafs
.safe_push (node
->vertex
);
2782 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
2785 vect_slp_build_vertices (vec_info
*info
, vec
<slp_tree
> &vertices
,
2788 hash_set
<slp_tree
> visited
;
2790 slp_instance instance
;
2791 FOR_EACH_VEC_ELT (info
->slp_instances
, i
, instance
)
2793 unsigned n_v
= vertices
.length ();
2794 unsigned n_l
= leafs
.length ();
2795 vect_slp_build_vertices (visited
, SLP_INSTANCE_TREE (instance
), vertices
,
2797 /* If we added vertices but no entries to the reverse graph we've
2798 added a cycle that is not backwards-reachable. Push the entry
2799 to mimic as leaf then. */
2800 if (vertices
.length () > n_v
2801 && leafs
.length () == n_l
)
2802 leafs
.safe_push (SLP_INSTANCE_TREE (instance
)->vertex
);
2806 /* Apply (reverse) bijectite PERM to VEC. */
2810 vect_slp_permute (vec
<unsigned> perm
,
2811 vec
<T
> &vec
, bool reverse
)
2813 auto_vec
<T
, 64> saved
;
2814 saved
.create (vec
.length ());
2815 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2816 saved
.quick_push (vec
[i
]);
2820 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2821 vec
[perm
[i
]] = saved
[i
];
2822 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2823 gcc_assert (vec
[perm
[i
]] == saved
[i
]);
2827 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2828 vec
[i
] = saved
[perm
[i
]];
2829 for (unsigned i
= 0; i
< vec
.length (); ++i
)
2830 gcc_assert (vec
[i
] == saved
[perm
[i
]]);
2834 /* Return whether permutations PERM_A and PERM_B as recorded in the
2835 PERMS vector are equal. */
2838 vect_slp_perms_eq (const vec
<vec
<unsigned> > &perms
,
2839 int perm_a
, int perm_b
)
2841 return (perm_a
== perm_b
2842 || (perms
[perm_a
].length () == perms
[perm_b
].length ()
2843 && memcmp (&perms
[perm_a
][0], &perms
[perm_b
][0],
2844 sizeof (unsigned) * perms
[perm_a
].length ()) == 0));
2847 /* Optimize the SLP graph of VINFO. */
2850 vect_optimize_slp (vec_info
*vinfo
)
2852 if (vinfo
->slp_instances
.is_empty ())
2857 auto_vec
<slp_tree
> vertices
;
2858 auto_vec
<int> leafs
;
2859 vect_slp_build_vertices (vinfo
, vertices
, leafs
);
2861 struct graph
*slpg
= new_graph (vertices
.length ());
2862 FOR_EACH_VEC_ELT (vertices
, i
, node
)
2866 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
2868 add_edge (slpg
, i
, child
->vertex
);
2871 /* Compute (reverse) postorder on the inverted graph. */
2873 graphds_dfs (slpg
, &leafs
[0], leafs
.length (), &ipo
, false, NULL
, NULL
);
2875 auto_sbitmap
n_visited (vertices
.length ());
2876 auto_sbitmap
n_materialize (vertices
.length ());
2877 auto_vec
<int> n_perm (vertices
.length ());
2878 auto_vec
<vec
<unsigned> > perms
;
2880 bitmap_clear (n_visited
);
2881 bitmap_clear (n_materialize
);
2882 n_perm
.quick_grow_cleared (vertices
.length ());
2883 perms
.safe_push (vNULL
); /* zero is no permute */
2885 /* Produce initial permutations. */
2886 for (i
= 0; i
< leafs
.length (); ++i
)
2889 slp_tree node
= vertices
[idx
];
2891 /* Handle externals and constants optimistically throughout the
2893 if (SLP_TREE_DEF_TYPE (node
) == vect_external_def
2894 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
)
2897 /* Leafs do not change across iterations. Note leafs also double
2898 as entries to the reverse graph. */
2899 if (!slpg
->vertices
[idx
].succ
)
2900 bitmap_set_bit (n_visited
, idx
);
2901 /* Loads are the only thing generating permutes. */
2902 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
2905 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
2906 node unpermuted, record this permute. */
2907 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
2908 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
2910 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
2911 unsigned imin
= DR_GROUP_SIZE (dr_stmt
) + 1, imax
= 0;
2912 bool any_permute
= false;
2913 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2915 unsigned idx
= SLP_TREE_LOAD_PERMUTATION (node
)[j
];
2916 imin
= MIN (imin
, idx
);
2917 imax
= MAX (imax
, idx
);
2918 if (idx
- SLP_TREE_LOAD_PERMUTATION (node
)[0] != j
)
2921 /* If there's no permute no need to split one out. */
2924 /* If the span doesn't match we'd disrupt VF computation, avoid
2926 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
2929 /* For now only handle true permutes, like
2930 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
2931 when permuting constants and invariants keeping the permute
2933 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
2934 bitmap_clear (load_index
);
2935 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2936 bitmap_set_bit (load_index
, SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
);
2938 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2939 if (!bitmap_bit_p (load_index
, j
))
2941 if (j
!= SLP_TREE_LANES (node
))
2944 vec
<unsigned> perm
= vNULL
;
2945 perm
.safe_grow (SLP_TREE_LANES (node
), true);
2946 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
2947 perm
[j
] = SLP_TREE_LOAD_PERMUTATION (node
)[j
] - imin
;
2948 perms
.safe_push (perm
);
2949 n_perm
[idx
] = perms
.length () - 1;
2952 /* Propagate permutes along the graph and compute materialization points. */
2954 unsigned iteration
= 0;
2960 for (i
= vertices
.length (); i
> 0 ; --i
)
2963 slp_tree node
= vertices
[idx
];
2964 /* For leafs there's nothing to do - we've seeded permutes
2966 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
2969 bitmap_set_bit (n_visited
, idx
);
2971 /* We cannot move a permute across a store. */
2972 if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))
2974 (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node
))))
2978 for (graph_edge
*succ
= slpg
->vertices
[idx
].succ
;
2979 succ
; succ
= succ
->succ_next
)
2981 int succ_idx
= succ
->dest
;
2982 /* Handle unvisited nodes optimistically. */
2983 /* ??? But for constants once we want to handle non-bijective
2984 permutes we have to verify the permute, when unifying lanes,
2985 will not unify different constants. For example see
2986 gcc.dg/vect/bb-slp-14.c for a case that would break. */
2987 if (!bitmap_bit_p (n_visited
, succ_idx
))
2989 int succ_perm
= n_perm
[succ_idx
];
2990 /* Once we materialize succs permutation its output lanes
2991 appear unpermuted to us. */
2992 if (bitmap_bit_p (n_materialize
, succ_idx
))
2996 else if (succ_perm
== 0)
3001 else if (!vect_slp_perms_eq (perms
, perm
, succ_perm
))
3009 /* Pick up pre-computed leaf values. */
3011 else if (!vect_slp_perms_eq (perms
, perm
, n_perm
[idx
]))
3014 /* Make sure we eventually converge. */
3015 gcc_checking_assert (perm
== 0);
3018 bitmap_clear_bit (n_materialize
, idx
);
3025 /* Elide pruning at materialization points in the first
3026 iteration so every node was visited once at least. */
3030 /* Decide on permute materialization. Look whether there's
3031 a use (pred) edge that is permuted differently than us.
3032 In that case mark ourselves so the permutation is applied. */
3033 bool all_preds_permuted
= slpg
->vertices
[idx
].pred
!= NULL
;
3034 for (graph_edge
*pred
= slpg
->vertices
[idx
].pred
;
3035 pred
; pred
= pred
->pred_next
)
3037 gcc_checking_assert (bitmap_bit_p (n_visited
, pred
->src
));
3038 int pred_perm
= n_perm
[pred
->src
];
3039 if (!vect_slp_perms_eq (perms
, perm
, pred_perm
))
3041 all_preds_permuted
= false;
3045 if (!all_preds_permuted
)
3047 if (!bitmap_bit_p (n_materialize
, idx
))
3049 bitmap_set_bit (n_materialize
, idx
);
3053 while (changed
|| iteration
== 1);
3056 for (i
= 0; i
< vertices
.length (); ++i
)
3058 int perm
= n_perm
[i
];
3062 slp_tree node
= vertices
[i
];
3064 /* First permute invariant/external original successors. */
3067 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3069 if (!child
|| SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3072 /* If the vector is uniform there's nothing to do. */
3073 if (vect_slp_tree_uniform_p (child
))
3076 /* We can end up sharing some externals via two_operator
3077 handling. Be prepared to unshare those. */
3078 if (child
->refcnt
!= 1)
3080 gcc_assert (slpg
->vertices
[child
->vertex
].pred
->pred_next
);
3081 SLP_TREE_CHILDREN (node
)[j
] = child
3082 = vect_create_new_slp_node
3083 (SLP_TREE_SCALAR_OPS (child
).copy ());
3085 vect_slp_permute (perms
[perm
],
3086 SLP_TREE_SCALAR_OPS (child
), true);
3089 if (bitmap_bit_p (n_materialize
, i
))
3091 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3092 /* For loads simply drop the permutation, the load permutation
3093 already performs the desired permutation. */
3095 else if (SLP_TREE_LANE_PERMUTATION (node
).exists ())
3097 /* If the node if already a permute node we just need to apply
3098 the permutation to the permute node itself. */
3099 if (dump_enabled_p ())
3100 dump_printf_loc (MSG_NOTE
, vect_location
,
3101 "simplifying permute node %p\n",
3104 vect_slp_permute (perms
[perm
], SLP_TREE_LANE_PERMUTATION (node
),
3109 if (dump_enabled_p ())
3110 dump_printf_loc (MSG_NOTE
, vect_location
,
3111 "inserting permute node in place of %p\n",
3114 /* Make a copy of NODE and in-place change it to a
3115 VEC_PERM node to permute the lanes of the copy. */
3116 slp_tree copy
= new _slp_tree
;
3117 SLP_TREE_CHILDREN (copy
) = SLP_TREE_CHILDREN (node
);
3118 SLP_TREE_CHILDREN (node
) = vNULL
;
3119 SLP_TREE_SCALAR_STMTS (copy
)
3120 = SLP_TREE_SCALAR_STMTS (node
).copy ();
3121 vect_slp_permute (perms
[perm
],
3122 SLP_TREE_SCALAR_STMTS (copy
), true);
3123 gcc_assert (!SLP_TREE_SCALAR_OPS (node
).exists ());
3124 SLP_TREE_REPRESENTATIVE (copy
) = SLP_TREE_REPRESENTATIVE (node
);
3125 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node
).exists ());
3126 SLP_TREE_LANE_PERMUTATION (copy
)
3127 = SLP_TREE_LANE_PERMUTATION (node
);
3128 SLP_TREE_LANE_PERMUTATION (node
) = vNULL
;
3129 SLP_TREE_VECTYPE (copy
) = SLP_TREE_VECTYPE (node
);
3131 copy
->max_nunits
= node
->max_nunits
;
3132 SLP_TREE_DEF_TYPE (copy
) = SLP_TREE_DEF_TYPE (node
);
3133 SLP_TREE_LANES (copy
) = SLP_TREE_LANES (node
);
3134 SLP_TREE_CODE (copy
) = SLP_TREE_CODE (node
);
3136 /* Now turn NODE into a VEC_PERM. */
3137 SLP_TREE_CHILDREN (node
).safe_push (copy
);
3138 SLP_TREE_LANE_PERMUTATION (node
).create (SLP_TREE_LANES (node
));
3139 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
3140 SLP_TREE_LANE_PERMUTATION (node
)
3141 .quick_push (std::make_pair (0, perms
[perm
][j
]));
3142 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
3147 /* Apply the reverse permutation to our stmts. */
3148 vect_slp_permute (perms
[perm
],
3149 SLP_TREE_SCALAR_STMTS (node
), true);
3150 /* And to the load permutation, which we can simply
3151 make regular by design. */
3152 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3154 /* ??? When we handle non-bijective permutes the idea
3155 is that we can force the load-permutation to be
3156 { min, min + 1, min + 2, ... max }. But then the
3157 scalar defs might no longer match the lane content
3158 which means wrong-code with live lane vectorization.
3159 So we possibly have to have NULL entries for those. */
3160 vect_slp_permute (perms
[perm
],
3161 SLP_TREE_LOAD_PERMUTATION (node
), true);
3166 /* Free the perms vector used for propagation. */
3167 while (!perms
.is_empty ())
3168 perms
.pop ().release ();
3172 /* Now elide load permutations that are not necessary. */
3173 for (i
= 0; i
< leafs
.length (); ++i
)
3175 node
= vertices
[leafs
[i
]];
3176 if (!SLP_TREE_LOAD_PERMUTATION (node
).exists ())
3179 /* In basic block vectorization we allow any subchain of an interleaving
3181 FORNOW: not in loop SLP because of realignment complications. */
3182 if (is_a
<bb_vec_info
> (vinfo
))
3184 bool subchain_p
= true;
3185 stmt_vec_info next_load_info
= NULL
;
3186 stmt_vec_info load_info
;
3188 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3191 && (next_load_info
!= load_info
3192 || DR_GROUP_GAP (load_info
) != 1))
3197 next_load_info
= DR_GROUP_NEXT_ELEMENT (load_info
);
3201 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3207 stmt_vec_info load_info
;
3208 bool this_load_permuted
= false;
3210 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
3211 if (SLP_TREE_LOAD_PERMUTATION (node
)[j
] != j
)
3213 this_load_permuted
= true;
3216 stmt_vec_info first_stmt_info
3217 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node
)[0]);
3218 if (!this_load_permuted
3219 /* The load requires permutation when unrolling exposes
3220 a gap either because the group is larger than the SLP
3221 group-size or because there is a gap between the groups. */
3222 && (known_eq (LOOP_VINFO_VECT_FACTOR
3223 (as_a
<loop_vec_info
> (vinfo
)), 1U)
3224 || ((SLP_TREE_LANES (node
) == DR_GROUP_SIZE (first_stmt_info
))
3225 && DR_GROUP_GAP (first_stmt_info
) == 0)))
3227 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3234 /* Gather loads reachable from the individual SLP graph entries. */
3237 vect_gather_slp_loads (vec_info
*vinfo
)
3240 slp_instance instance
;
3241 FOR_EACH_VEC_ELT (vinfo
->slp_instances
, i
, instance
)
3243 hash_set
<slp_tree
> visited
;
3244 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance
),
3245 SLP_INSTANCE_TREE (instance
), visited
);
3250 /* For each possible SLP instance decide whether to SLP it and calculate overall
3251 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
3252 least one instance. */
3255 vect_make_slp_decision (loop_vec_info loop_vinfo
)
3258 poly_uint64 unrolling_factor
= 1;
3259 vec
<slp_instance
> slp_instances
= LOOP_VINFO_SLP_INSTANCES (loop_vinfo
);
3260 slp_instance instance
;
3261 int decided_to_slp
= 0;
3263 DUMP_VECT_SCOPE ("vect_make_slp_decision");
3265 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
3267 /* FORNOW: SLP if you can. */
3268 /* All unroll factors have the form:
3270 GET_MODE_SIZE (vinfo->vector_mode) * X
3272 for some rational X, so they must have a common multiple. */
3274 = force_common_multiple (unrolling_factor
,
3275 SLP_INSTANCE_UNROLLING_FACTOR (instance
));
3277 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
3278 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
3279 loop-based vectorization. Such stmts will be marked as HYBRID. */
3280 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
3284 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo
) = unrolling_factor
;
3286 if (decided_to_slp
&& dump_enabled_p ())
3288 dump_printf_loc (MSG_NOTE
, vect_location
,
3289 "Decided to SLP %d instances. Unrolling factor ",
3291 dump_dec (MSG_NOTE
, unrolling_factor
);
3292 dump_printf (MSG_NOTE
, "\n");
3295 return (decided_to_slp
> 0);
3298 /* Private data for vect_detect_hybrid_slp. */
3301 loop_vec_info loop_vinfo
;
3302 vec
<stmt_vec_info
> *worklist
;
3305 /* Walker for walk_gimple_op. */
3308 vect_detect_hybrid_slp (tree
*tp
, int *, void *data
)
3310 walk_stmt_info
*wi
= (walk_stmt_info
*)data
;
3311 vdhs_data
*dat
= (vdhs_data
*)wi
->info
;
3316 stmt_vec_info def_stmt_info
= dat
->loop_vinfo
->lookup_def (*tp
);
3319 def_stmt_info
= vect_stmt_to_vectorize (def_stmt_info
);
3320 if (PURE_SLP_STMT (def_stmt_info
))
3322 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_NOTE
, vect_location
, "marking hybrid: %G",
3324 def_stmt_info
->stmt
);
3325 STMT_SLP_TYPE (def_stmt_info
) = hybrid
;
3326 dat
->worklist
->safe_push (def_stmt_info
);
3332 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
3333 if so, otherwise pushing it to WORKLIST. */
3336 maybe_push_to_hybrid_worklist (vec_info
*vinfo
,
3337 vec
<stmt_vec_info
> &worklist
,
3338 stmt_vec_info stmt_info
)
3340 if (dump_enabled_p ())
3341 dump_printf_loc (MSG_NOTE
, vect_location
,
3342 "Processing hybrid candidate : %G", stmt_info
->stmt
);
3343 stmt_vec_info orig_info
= vect_orig_stmt (stmt_info
);
3344 imm_use_iterator iter2
;
3346 use_operand_p use_p
;
3347 def_operand_p def_p
;
3348 bool any_def
= false;
3349 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_info
->stmt
, iter1
, SSA_OP_DEF
)
3352 FOR_EACH_IMM_USE_FAST (use_p
, iter2
, DEF_FROM_PTR (def_p
))
3354 if (is_gimple_debug (USE_STMT (use_p
)))
3356 stmt_vec_info use_info
= vinfo
->lookup_stmt (USE_STMT (use_p
));
3357 /* An out-of loop use means this is a loop_vect sink. */
3360 if (dump_enabled_p ())
3361 dump_printf_loc (MSG_NOTE
, vect_location
,
3362 "Found loop_vect sink: %G", stmt_info
->stmt
);
3363 worklist
.safe_push (stmt_info
);
3366 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info
)))
3368 if (dump_enabled_p ())
3369 dump_printf_loc (MSG_NOTE
, vect_location
,
3370 "Found loop_vect use: %G", use_info
->stmt
);
3371 worklist
.safe_push (stmt_info
);
3376 /* No def means this is a loo_vect sink. */
3379 if (dump_enabled_p ())
3380 dump_printf_loc (MSG_NOTE
, vect_location
,
3381 "Found loop_vect sink: %G", stmt_info
->stmt
);
3382 worklist
.safe_push (stmt_info
);
3385 if (dump_enabled_p ())
3386 dump_printf_loc (MSG_NOTE
, vect_location
,
3387 "Marked SLP consumed stmt pure: %G", stmt_info
->stmt
);
3388 STMT_SLP_TYPE (stmt_info
) = pure_slp
;
3391 /* Find stmts that must be both vectorized and SLPed. */
3394 vect_detect_hybrid_slp (loop_vec_info loop_vinfo
)
3396 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
3398 /* All stmts participating in SLP are marked pure_slp, all other
3399 stmts are loop_vect.
3400 First collect all loop_vect stmts into a worklist.
3401 SLP patterns cause not all original scalar stmts to appear in
3402 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
3403 Rectify this here and do a backward walk over the IL only considering
3404 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
3405 mark them as pure_slp. */
3406 auto_vec
<stmt_vec_info
> worklist
;
3407 for (int i
= LOOP_VINFO_LOOP (loop_vinfo
)->num_nodes
- 1; i
>= 0; --i
)
3409 basic_block bb
= LOOP_VINFO_BBS (loop_vinfo
)[i
];
3410 for (gphi_iterator gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
);
3413 gphi
*phi
= gsi
.phi ();
3414 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (phi
);
3415 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3416 maybe_push_to_hybrid_worklist (loop_vinfo
,
3417 worklist
, stmt_info
);
3419 for (gimple_stmt_iterator gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
);
3422 gimple
*stmt
= gsi_stmt (gsi
);
3423 if (is_gimple_debug (stmt
))
3425 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
3426 if (STMT_VINFO_IN_PATTERN_P (stmt_info
))
3428 for (gimple_stmt_iterator gsi2
3429 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
));
3430 !gsi_end_p (gsi2
); gsi_next (&gsi2
))
3432 stmt_vec_info patt_info
3433 = loop_vinfo
->lookup_stmt (gsi_stmt (gsi2
));
3434 if (!STMT_SLP_TYPE (patt_info
)
3435 && STMT_VINFO_RELEVANT (patt_info
))
3436 maybe_push_to_hybrid_worklist (loop_vinfo
,
3437 worklist
, patt_info
);
3439 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
3441 if (!STMT_SLP_TYPE (stmt_info
) && STMT_VINFO_RELEVANT (stmt_info
))
3442 maybe_push_to_hybrid_worklist (loop_vinfo
,
3443 worklist
, stmt_info
);
3447 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
3448 mark any SLP vectorized stmt as hybrid.
3449 ??? We're visiting def stmts N times (once for each non-SLP and
3450 once for each hybrid-SLP use). */
3453 dat
.worklist
= &worklist
;
3454 dat
.loop_vinfo
= loop_vinfo
;
3455 memset (&wi
, 0, sizeof (wi
));
3456 wi
.info
= (void *)&dat
;
3457 while (!worklist
.is_empty ())
3459 stmt_vec_info stmt_info
= worklist
.pop ();
3460 /* Since SSA operands are not set up for pattern stmts we need
3461 to use walk_gimple_op. */
3463 walk_gimple_op (stmt_info
->stmt
, vect_detect_hybrid_slp
, &wi
);
3468 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
3470 _bb_vec_info::_bb_vec_info (vec
<basic_block
> _bbs
, vec_info_shared
*shared
)
3471 : vec_info (vec_info::bb
, init_cost (NULL
), shared
), bbs (_bbs
), roots (vNULL
)
3473 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3476 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3479 gphi
*phi
= si
.phi ();
3480 gimple_set_uid (phi
, 0);
3483 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3484 !gsi_end_p (gsi
); gsi_next (&gsi
))
3486 gimple
*stmt
= gsi_stmt (gsi
);
3487 gimple_set_uid (stmt
, 0);
3488 if (is_gimple_debug (stmt
))
3496 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
3497 stmts in the basic block. */
3499 _bb_vec_info::~_bb_vec_info ()
3501 /* Reset region marker. */
3502 for (unsigned i
= 0; i
< bbs
.length (); ++i
)
3505 for (gphi_iterator si
= gsi_start_phis (bbs
[i
]); !gsi_end_p (si
);
3508 gphi
*phi
= si
.phi ();
3509 gimple_set_uid (phi
, -1);
3511 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3512 !gsi_end_p (gsi
); gsi_next (&gsi
))
3514 gimple
*stmt
= gsi_stmt (gsi
);
3515 gimple_set_uid (stmt
, -1);
3519 for (unsigned i
= 0; i
< roots
.length (); ++i
)
3520 roots
[i
].stmts
.release ();
3524 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
3525 given then that child nodes have already been processed, and that
3526 their def types currently match their SLP node's def type. */
3529 vect_slp_analyze_node_operations_1 (vec_info
*vinfo
, slp_tree node
,
3530 slp_instance node_instance
,
3531 stmt_vector_for_cost
*cost_vec
)
3533 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
3534 gcc_assert (STMT_SLP_TYPE (stmt_info
) != loop_vect
);
3536 /* Calculate the number of vector statements to be created for the
3537 scalar stmts in this node. For SLP reductions it is equal to the
3538 number of vector statements in the children (which has already been
3539 calculated by the recursive call). Otherwise it is the number of
3540 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
3541 VF divided by the number of elements in a vector. */
3542 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
)
3543 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
))
3545 for (unsigned i
= 0; i
< SLP_TREE_CHILDREN (node
).length (); ++i
)
3546 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node
)[i
]) == vect_internal_def
)
3548 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3549 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node
)[i
]);
3556 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3557 vf
= loop_vinfo
->vectorization_factor
;
3560 unsigned int group_size
= SLP_TREE_LANES (node
);
3561 tree vectype
= SLP_TREE_VECTYPE (node
);
3562 SLP_TREE_NUMBER_OF_VEC_STMTS (node
)
3563 = vect_get_num_vectors (vf
* group_size
, vectype
);
3566 /* Handle purely internal nodes. */
3567 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
3568 return vectorizable_slp_permutation (vinfo
, NULL
, node
, cost_vec
);
3570 if (is_a
<bb_vec_info
> (vinfo
)
3571 && !vect_update_shared_vectype (stmt_info
, SLP_TREE_VECTYPE (node
)))
3573 if (dump_enabled_p ())
3574 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3575 "desired vector type conflicts with earlier one "
3576 "for %G", stmt_info
->stmt
);
3581 return vect_analyze_stmt (vinfo
, stmt_info
, &dummy
,
3582 node
, node_instance
, cost_vec
);
3585 /* Try to build NODE from scalars, returning true on success.
3586 NODE_INSTANCE is the SLP instance that contains NODE. */
3589 vect_slp_convert_to_external (vec_info
*vinfo
, slp_tree node
,
3590 slp_instance node_instance
)
3592 stmt_vec_info stmt_info
;
3595 if (!is_a
<bb_vec_info
> (vinfo
)
3596 || node
== SLP_INSTANCE_TREE (node_instance
)
3597 || !SLP_TREE_SCALAR_STMTS (node
).exists ()
3598 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node
)))
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_NOTE
, vect_location
,
3603 "Building vector operands of %p from scalars instead\n", node
);
3605 /* Don't remove and free the child nodes here, since they could be
3606 referenced by other structures. The analysis and scheduling phases
3607 (need to) ignore child nodes of anything that isn't vect_internal_def. */
3608 unsigned int group_size
= SLP_TREE_LANES (node
);
3609 SLP_TREE_DEF_TYPE (node
) = vect_external_def
;
3610 SLP_TREE_SCALAR_OPS (node
).safe_grow (group_size
, true);
3611 SLP_TREE_LOAD_PERMUTATION (node
).release ();
3612 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3614 tree lhs
= gimple_get_lhs (vect_orig_stmt (stmt_info
)->stmt
);
3615 SLP_TREE_SCALAR_OPS (node
)[i
] = lhs
;
3620 /* Compute the prologue cost for invariant or constant operands represented
3624 vect_prologue_cost_for_slp (slp_tree node
,
3625 stmt_vector_for_cost
*cost_vec
)
3627 /* There's a special case of an existing vector, that costs nothing. */
3628 if (SLP_TREE_SCALAR_OPS (node
).length () == 0
3629 && !SLP_TREE_VEC_DEFS (node
).is_empty ())
3631 /* Without looking at the actual initializer a vector of
3632 constants can be implemented as load from the constant pool.
3633 When all elements are the same we can use a splat. */
3634 tree vectype
= SLP_TREE_VECTYPE (node
);
3635 unsigned group_size
= SLP_TREE_SCALAR_OPS (node
).length ();
3636 unsigned num_vects_to_check
;
3637 unsigned HOST_WIDE_INT const_nunits
;
3638 unsigned nelt_limit
;
3639 if (TYPE_VECTOR_SUBPARTS (vectype
).is_constant (&const_nunits
)
3640 && ! multiple_p (const_nunits
, group_size
))
3642 num_vects_to_check
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
3643 nelt_limit
= const_nunits
;
3647 /* If either the vector has variable length or the vectors
3648 are composed of repeated whole groups we only need to
3649 cost construction once. All vectors will be the same. */
3650 num_vects_to_check
= 1;
3651 nelt_limit
= group_size
;
3653 tree elt
= NULL_TREE
;
3655 for (unsigned j
= 0; j
< num_vects_to_check
* nelt_limit
; ++j
)
3657 unsigned si
= j
% group_size
;
3659 elt
= SLP_TREE_SCALAR_OPS (node
)[si
];
3660 /* ??? We're just tracking whether all operands of a single
3661 vector initializer are the same, ideally we'd check if
3662 we emitted the same one already. */
3663 else if (elt
!= SLP_TREE_SCALAR_OPS (node
)[si
])
3666 if (nelt
== nelt_limit
)
3668 record_stmt_cost (cost_vec
, 1,
3669 SLP_TREE_DEF_TYPE (node
) == vect_external_def
3670 ? (elt
? scalar_to_vec
: vec_construct
)
3672 NULL
, vectype
, 0, vect_prologue
);
3678 /* Analyze statements contained in SLP tree NODE after recursively analyzing
3679 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
3681 Return true if the operations are supported. */
3684 vect_slp_analyze_node_operations (vec_info
*vinfo
, slp_tree node
,
3685 slp_instance node_instance
,
3686 hash_set
<slp_tree
> &visited_set
,
3687 vec
<slp_tree
> &visited_vec
,
3688 stmt_vector_for_cost
*cost_vec
)
3693 /* Assume we can code-generate all invariants. */
3695 || SLP_TREE_DEF_TYPE (node
) == vect_constant_def
3696 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
3699 if (SLP_TREE_DEF_TYPE (node
) == vect_uninitialized_def
)
3701 if (dump_enabled_p ())
3702 dump_printf_loc (MSG_NOTE
, vect_location
,
3703 "Failed cyclic SLP reference in %p", node
);
3706 gcc_assert (SLP_TREE_DEF_TYPE (node
) == vect_internal_def
);
3708 /* If we already analyzed the exact same set of scalar stmts we're done.
3709 We share the generated vector stmts for those. */
3710 if (visited_set
.add (node
))
3712 visited_vec
.safe_push (node
);
3715 unsigned visited_rec_start
= visited_vec
.length ();
3716 unsigned cost_vec_rec_start
= cost_vec
->length ();
3717 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3719 res
= vect_slp_analyze_node_operations (vinfo
, child
, node_instance
,
3720 visited_set
, visited_vec
,
3727 res
= vect_slp_analyze_node_operations_1 (vinfo
, node
, node_instance
,
3729 /* If analysis failed we have to pop all recursive visited nodes
3733 while (visited_vec
.length () >= visited_rec_start
)
3734 visited_set
.remove (visited_vec
.pop ());
3735 cost_vec
->truncate (cost_vec_rec_start
);
3738 /* When the node can be vectorized cost invariant nodes it references.
3739 This is not done in DFS order to allow the refering node
3740 vectorizable_* calls to nail down the invariant nodes vector type
3741 and possibly unshare it if it needs a different vector type than
3744 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), j
, child
)
3746 && (SLP_TREE_DEF_TYPE (child
) == vect_constant_def
3747 || SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
3748 /* Perform usual caching, note code-generation still
3749 code-gens these nodes multiple times but we expect
3750 to CSE them later. */
3751 && !visited_set
.add (child
))
3753 visited_vec
.safe_push (child
);
3754 /* ??? After auditing more code paths make a "default"
3755 and push the vector type from NODE to all children
3756 if it is not already set. */
3757 /* Compute the number of vectors to be generated. */
3758 tree vector_type
= SLP_TREE_VECTYPE (child
);
3761 /* For shifts with a scalar argument we don't need
3762 to cost or code-generate anything.
3763 ??? Represent this more explicitely. */
3764 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node
))
3765 == shift_vec_info_type
)
3769 unsigned group_size
= SLP_TREE_LANES (child
);
3771 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
3772 vf
= loop_vinfo
->vectorization_factor
;
3773 SLP_TREE_NUMBER_OF_VEC_STMTS (child
)
3774 = vect_get_num_vectors (vf
* group_size
, vector_type
);
3775 /* And cost them. */
3776 vect_prologue_cost_for_slp (child
, cost_vec
);
3779 /* If this node or any of its children can't be vectorized, try pruning
3780 the tree here rather than felling the whole thing. */
3781 if (!res
&& vect_slp_convert_to_external (vinfo
, node
, node_instance
))
3783 /* We'll need to revisit this for invariant costing and number
3784 of vectorized stmt setting. */
3792 /* Mark lanes of NODE that are live outside of the basic-block vectorized
3793 region and that can be vectorized using vectorizable_live_operation
3794 with STMT_VINFO_LIVE_P. Not handled live operations will cause the
3795 scalar code computing it to be retained. */
3798 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo
, slp_tree node
,
3799 slp_instance instance
,
3800 stmt_vector_for_cost
*cost_vec
,
3801 hash_set
<stmt_vec_info
> &svisited
,
3802 hash_set
<slp_tree
> &visited
)
3804 if (visited
.add (node
))
3808 stmt_vec_info stmt_info
;
3809 stmt_vec_info last_stmt
= vect_find_last_scalar_stmt_in_slp (node
);
3810 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
3812 if (svisited
.contains (stmt_info
))
3814 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
3815 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info
)
3816 && STMT_VINFO_RELATED_STMT (orig_stmt_info
) != stmt_info
)
3817 /* Only the pattern root stmt computes the original scalar value. */
3819 bool mark_visited
= true;
3820 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
3821 ssa_op_iter op_iter
;
3822 def_operand_p def_p
;
3823 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
3825 imm_use_iterator use_iter
;
3827 stmt_vec_info use_stmt_info
;
3828 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3829 if (!is_gimple_debug (use_stmt
))
3831 use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
);
3833 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3835 STMT_VINFO_LIVE_P (stmt_info
) = true;
3836 if (vectorizable_live_operation (bb_vinfo
, stmt_info
,
3837 NULL
, node
, instance
, i
,
3839 /* ??? So we know we can vectorize the live stmt
3840 from one SLP node. If we cannot do so from all
3841 or none consistently we'd have to record which
3842 SLP node (and lane) we want to use for the live
3843 operation. So make sure we can code-generate
3845 mark_visited
= false;
3847 STMT_VINFO_LIVE_P (stmt_info
) = false;
3848 BREAK_FROM_IMM_USE_STMT (use_iter
);
3851 /* We have to verify whether we can insert the lane extract
3852 before all uses. The following is a conservative approximation.
3853 We cannot put this into vectorizable_live_operation because
3854 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
3856 Note that while the fact that we emit code for loads at the
3857 first load should make this a non-problem leafs we construct
3858 from scalars are vectorized after the last scalar def.
3859 ??? If we'd actually compute the insert location during
3860 analysis we could use sth less conservative than the last
3861 scalar stmt in the node for the dominance check. */
3862 /* ??? What remains is "live" uses in vector CTORs in the same
3863 SLP graph which is where those uses can end up code-generated
3864 right after their definition instead of close to their original
3865 use. But that would restrict us to code-generate lane-extracts
3866 from the latest stmt in a node. So we compensate for this
3867 during code-generation, simply not replacing uses for those
3868 hopefully rare cases. */
3869 if (STMT_VINFO_LIVE_P (stmt_info
))
3870 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
3871 if (!is_gimple_debug (use_stmt
)
3872 && (!(use_stmt_info
= bb_vinfo
->lookup_stmt (use_stmt
))
3873 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
)))
3874 && !vect_stmt_dominates_stmt_p (last_stmt
->stmt
, use_stmt
))
3876 if (dump_enabled_p ())
3877 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3878 "Cannot determine insertion place for "
3880 STMT_VINFO_LIVE_P (stmt_info
) = false;
3881 mark_visited
= true;
3885 svisited
.add (stmt_info
);
3889 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
3890 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
3891 vect_bb_slp_mark_live_stmts (bb_vinfo
, child
, instance
,
3892 cost_vec
, svisited
, visited
);
3895 /* Analyze statements in SLP instances of VINFO. Return true if the
3896 operations are supported. */
3899 vect_slp_analyze_operations (vec_info
*vinfo
)
3901 slp_instance instance
;
3904 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
3906 hash_set
<slp_tree
> visited
;
3907 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); )
3909 auto_vec
<slp_tree
> visited_vec
;
3910 stmt_vector_for_cost cost_vec
;
3911 cost_vec
.create (2);
3912 if (is_a
<bb_vec_info
> (vinfo
))
3913 vect_location
= instance
->location ();
3914 if (!vect_slp_analyze_node_operations (vinfo
,
3915 SLP_INSTANCE_TREE (instance
),
3916 instance
, visited
, visited_vec
,
3918 /* Instances with a root stmt require vectorized defs for the
3920 || (SLP_INSTANCE_ROOT_STMT (instance
)
3921 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance
))
3922 != vect_internal_def
)))
3924 slp_tree node
= SLP_INSTANCE_TREE (instance
);
3925 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
3926 if (dump_enabled_p ())
3927 dump_printf_loc (MSG_NOTE
, vect_location
,
3928 "removing SLP instance operations starting from: %G",
3930 vect_free_slp_instance (instance
);
3931 vinfo
->slp_instances
.ordered_remove (i
);
3932 cost_vec
.release ();
3933 while (!visited_vec
.is_empty ())
3934 visited
.remove (visited_vec
.pop ());
3940 /* For BB vectorization remember the SLP graph entry
3942 if (is_a
<bb_vec_info
> (vinfo
))
3943 instance
->cost_vec
= cost_vec
;
3946 add_stmt_costs (vinfo
, vinfo
->target_cost_data
, &cost_vec
);
3947 cost_vec
.release ();
3952 /* Compute vectorizable live stmts. */
3953 if (bb_vec_info bb_vinfo
= dyn_cast
<bb_vec_info
> (vinfo
))
3955 hash_set
<stmt_vec_info
> svisited
;
3956 hash_set
<slp_tree
> visited
;
3957 for (i
= 0; vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
3959 vect_location
= instance
->location ();
3960 vect_bb_slp_mark_live_stmts (bb_vinfo
, SLP_INSTANCE_TREE (instance
),
3961 instance
, &instance
->cost_vec
, svisited
,
3966 return !vinfo
->slp_instances
.is_empty ();
3969 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
3970 closing the eventual chain. */
3973 get_ultimate_leader (slp_instance instance
,
3974 hash_map
<slp_instance
, slp_instance
> &instance_leader
)
3976 auto_vec
<slp_instance
*, 8> chain
;
3978 while (*(tem
= instance_leader
.get (instance
)) != instance
)
3980 chain
.safe_push (tem
);
3983 while (!chain
.is_empty ())
3984 *chain
.pop () = instance
;
3988 /* Worker of vect_bb_partition_graph, recurse on NODE. */
3991 vect_bb_partition_graph_r (bb_vec_info bb_vinfo
,
3992 slp_instance instance
, slp_tree node
,
3993 hash_map
<stmt_vec_info
, slp_instance
> &stmt_to_instance
,
3994 hash_map
<slp_instance
, slp_instance
> &instance_leader
,
3995 hash_set
<slp_tree
> &visited
)
3997 stmt_vec_info stmt_info
;
4000 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4003 slp_instance
&stmt_instance
4004 = stmt_to_instance
.get_or_insert (stmt_info
, &existed_p
);
4007 else if (stmt_instance
!= instance
)
4009 /* If we're running into a previously marked stmt make us the
4010 leader of the current ultimate leader. This keeps the
4011 leader chain acyclic and works even when the current instance
4012 connects two previously independent graph parts. */
4013 slp_instance stmt_leader
4014 = get_ultimate_leader (stmt_instance
, instance_leader
);
4015 if (stmt_leader
!= instance
)
4016 instance_leader
.put (stmt_leader
, instance
);
4018 stmt_instance
= instance
;
4021 if (visited
.add (node
))
4025 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4026 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4027 vect_bb_partition_graph_r (bb_vinfo
, instance
, child
, stmt_to_instance
,
4028 instance_leader
, visited
);
4031 /* Partition the SLP graph into pieces that can be costed independently. */
4034 vect_bb_partition_graph (bb_vec_info bb_vinfo
)
4036 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
4038 /* First walk the SLP graph assigning each involved scalar stmt a
4039 corresponding SLP graph entry and upon visiting a previously
4040 marked stmt, make the stmts leader the current SLP graph entry. */
4041 hash_map
<stmt_vec_info
, slp_instance
> stmt_to_instance
;
4042 hash_map
<slp_instance
, slp_instance
> instance_leader
;
4043 hash_set
<slp_tree
> visited
;
4044 slp_instance instance
;
4045 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4047 instance_leader
.put (instance
, instance
);
4048 vect_bb_partition_graph_r (bb_vinfo
,
4049 instance
, SLP_INSTANCE_TREE (instance
),
4050 stmt_to_instance
, instance_leader
,
4054 /* Then collect entries to each independent subgraph. */
4055 for (unsigned i
= 0; bb_vinfo
->slp_instances
.iterate (i
, &instance
); ++i
)
4057 slp_instance leader
= get_ultimate_leader (instance
, instance_leader
);
4058 leader
->subgraph_entries
.safe_push (instance
);
4059 if (dump_enabled_p ()
4060 && leader
!= instance
)
4061 dump_printf_loc (MSG_NOTE
, vect_location
,
4062 "instance %p is leader of %p\n",
4067 /* Compute the scalar cost of the SLP node NODE and its children
4068 and return it. Do not account defs that are marked in LIFE and
4069 update LIFE according to uses of NODE. */
4072 vect_bb_slp_scalar_cost (vec_info
*vinfo
,
4073 slp_tree node
, vec
<bool, va_heap
> *life
,
4074 stmt_vector_for_cost
*cost_vec
,
4075 hash_set
<slp_tree
> &visited
)
4078 stmt_vec_info stmt_info
;
4081 if (visited
.add (node
))
4084 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
4086 ssa_op_iter op_iter
;
4087 def_operand_p def_p
;
4092 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
4093 gimple
*orig_stmt
= orig_stmt_info
->stmt
;
4095 /* If there is a non-vectorized use of the defs then the scalar
4096 stmt is kept live in which case we do not account it or any
4097 required defs in the SLP children in the scalar cost. This
4098 way we make the vectorization more costly when compared to
4100 if (!STMT_VINFO_LIVE_P (stmt_info
))
4102 FOR_EACH_PHI_OR_STMT_DEF (def_p
, orig_stmt
, op_iter
, SSA_OP_DEF
)
4104 imm_use_iterator use_iter
;
4106 FOR_EACH_IMM_USE_STMT (use_stmt
, use_iter
, DEF_FROM_PTR (def_p
))
4107 if (!is_gimple_debug (use_stmt
))
4109 stmt_vec_info use_stmt_info
= vinfo
->lookup_stmt (use_stmt
);
4112 (vect_stmt_to_vectorize (use_stmt_info
)))
4115 BREAK_FROM_IMM_USE_STMT (use_iter
);
4123 /* Count scalar stmts only once. */
4124 if (gimple_visited_p (orig_stmt
))
4126 gimple_set_visited (orig_stmt
, true);
4128 vect_cost_for_stmt kind
;
4129 if (STMT_VINFO_DATA_REF (orig_stmt_info
))
4131 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info
)))
4134 kind
= scalar_store
;
4136 else if (vect_nop_conversion_p (orig_stmt_info
))
4140 record_stmt_cost (cost_vec
, 1, kind
, orig_stmt_info
,
4141 SLP_TREE_VECTYPE (node
), 0, vect_body
);
4144 auto_vec
<bool, 20> subtree_life
;
4145 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
4147 if (child
&& SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
4149 /* Do not directly pass LIFE to the recursive call, copy it to
4150 confine changes in the callee to the current child/subtree. */
4151 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
4153 subtree_life
.safe_grow_cleared (SLP_TREE_LANES (child
), true);
4154 for (unsigned j
= 0;
4155 j
< SLP_TREE_LANE_PERMUTATION (node
).length (); ++j
)
4157 auto perm
= SLP_TREE_LANE_PERMUTATION (node
)[j
];
4158 if (perm
.first
== i
)
4159 subtree_life
[perm
.second
] = (*life
)[j
];
4164 gcc_assert (SLP_TREE_LANES (node
) == SLP_TREE_LANES (child
));
4165 subtree_life
.safe_splice (*life
);
4167 vect_bb_slp_scalar_cost (vinfo
, child
, &subtree_life
, cost_vec
,
4169 subtree_life
.truncate (0);
4174 /* Check if vectorization of the basic block is profitable for the
4175 subgraph denoted by SLP_INSTANCES. */
4178 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo
,
4179 vec
<slp_instance
> slp_instances
)
4181 slp_instance instance
;
4183 unsigned int vec_inside_cost
= 0, vec_outside_cost
= 0, scalar_cost
= 0;
4184 unsigned int vec_prologue_cost
= 0, vec_epilogue_cost
= 0;
4186 void *vect_target_cost_data
= init_cost (NULL
);
4188 /* Calculate scalar cost and sum the cost for the vector stmts
4189 previously collected. */
4190 stmt_vector_for_cost scalar_costs
;
4191 scalar_costs
.create (0);
4192 hash_set
<slp_tree
> visited
;
4193 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
4195 auto_vec
<bool, 20> life
;
4196 life
.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
)),
4198 vect_bb_slp_scalar_cost (bb_vinfo
,
4199 SLP_INSTANCE_TREE (instance
),
4200 &life
, &scalar_costs
, visited
);
4201 add_stmt_costs (bb_vinfo
, vect_target_cost_data
, &instance
->cost_vec
);
4202 instance
->cost_vec
.release ();
4204 /* Unset visited flag. */
4205 stmt_info_for_cost
*si
;
4206 FOR_EACH_VEC_ELT (scalar_costs
, i
, si
)
4207 gimple_set_visited (si
->stmt_info
->stmt
, false);
4209 void *scalar_target_cost_data
= init_cost (NULL
);
4210 add_stmt_costs (bb_vinfo
, scalar_target_cost_data
, &scalar_costs
);
4211 scalar_costs
.release ();
4213 finish_cost (scalar_target_cost_data
, &dummy
, &scalar_cost
, &dummy
);
4214 destroy_cost_data (scalar_target_cost_data
);
4216 /* Complete the target-specific vector cost calculation. */
4217 finish_cost (vect_target_cost_data
, &vec_prologue_cost
,
4218 &vec_inside_cost
, &vec_epilogue_cost
);
4219 destroy_cost_data (vect_target_cost_data
);
4221 vec_outside_cost
= vec_prologue_cost
+ vec_epilogue_cost
;
4223 if (dump_enabled_p ())
4225 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
4226 dump_printf (MSG_NOTE
, " Vector inside of basic block cost: %d\n",
4228 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n", vec_prologue_cost
);
4229 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n", vec_epilogue_cost
);
4230 dump_printf (MSG_NOTE
, " Scalar cost of basic block: %d\n", scalar_cost
);
4233 /* Vectorization is profitable if its cost is more than the cost of scalar
4234 version. Note that we err on the vector side for equal cost because
4235 the cost estimate is otherwise quite pessimistic (constant uses are
4236 free on the scalar side but cost a load on the vector side for
4238 if (vec_outside_cost
+ vec_inside_cost
> scalar_cost
)
4244 /* qsort comparator for lane defs. */
4247 vld_cmp (const void *a_
, const void *b_
)
4249 auto *a
= (const std::pair
<unsigned, tree
> *)a_
;
4250 auto *b
= (const std::pair
<unsigned, tree
> *)b_
;
4251 return a
->first
- b
->first
;
4254 /* Return true if USE_STMT is a vector lane insert into VEC and set
4255 *THIS_LANE to the lane number that is set. */
4258 vect_slp_is_lane_insert (gimple
*use_stmt
, tree vec
, unsigned *this_lane
)
4260 gassign
*use_ass
= dyn_cast
<gassign
*> (use_stmt
);
4262 || gimple_assign_rhs_code (use_ass
) != BIT_INSERT_EXPR
4264 ? gimple_assign_rhs1 (use_ass
) != vec
4265 : ((vec
= gimple_assign_rhs1 (use_ass
)), false))
4266 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec
)),
4267 TREE_TYPE (gimple_assign_rhs2 (use_ass
)))
4268 || !constant_multiple_p
4269 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass
)),
4270 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec
)))),
4276 /* Find any vectorizable constructors and add them to the grouped_store
4280 vect_slp_check_for_constructors (bb_vec_info bb_vinfo
)
4282 for (unsigned i
= 0; i
< bb_vinfo
->bbs
.length (); ++i
)
4283 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[i
]);
4284 !gsi_end_p (gsi
); gsi_next (&gsi
))
4286 gassign
*assign
= dyn_cast
<gassign
*> (gsi_stmt (gsi
));
4290 tree rhs
= gimple_assign_rhs1 (assign
);
4291 if (gimple_assign_rhs_code (assign
) == CONSTRUCTOR
)
4293 if (!VECTOR_TYPE_P (TREE_TYPE (rhs
))
4294 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)),
4295 CONSTRUCTOR_NELTS (rhs
))
4296 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs
, 0)->value
))
4297 || uniform_vector_p (rhs
))
4302 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs
), j
, val
)
4303 if (TREE_CODE (val
) != SSA_NAME
4304 || !bb_vinfo
->lookup_def (val
))
4306 if (j
!= CONSTRUCTOR_NELTS (rhs
))
4309 stmt_vec_info stmt_info
= bb_vinfo
->lookup_stmt (assign
);
4310 BB_VINFO_GROUPED_STORES (bb_vinfo
).safe_push (stmt_info
);
4312 else if (gimple_assign_rhs_code (assign
) == BIT_INSERT_EXPR
4313 && VECTOR_TYPE_P (TREE_TYPE (rhs
))
4314 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).is_constant ()
4315 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs
)).to_constant () > 1
4316 && integer_zerop (gimple_assign_rhs3 (assign
))
4317 && useless_type_conversion_p
4318 (TREE_TYPE (TREE_TYPE (rhs
)),
4319 TREE_TYPE (gimple_assign_rhs2 (assign
)))
4320 && bb_vinfo
->lookup_def (gimple_assign_rhs2 (assign
)))
4322 /* We start to match on insert to lane zero but since the
4323 inserts need not be ordered we'd have to search both
4324 the def and the use chains. */
4325 tree vectype
= TREE_TYPE (rhs
);
4326 unsigned nlanes
= TYPE_VECTOR_SUBPARTS (vectype
).to_constant ();
4327 auto_vec
<std::pair
<unsigned, tree
> > lane_defs (nlanes
);
4328 auto_sbitmap
lanes (nlanes
);
4329 bitmap_clear (lanes
);
4330 bitmap_set_bit (lanes
, 0);
4331 tree def
= gimple_assign_lhs (assign
);
4332 lane_defs
.quick_push
4333 (std::make_pair (0, gimple_assign_rhs2 (assign
)));
4334 unsigned lanes_found
= 1;
4335 /* Start with the use chains, the last stmt will be the root. */
4336 stmt_vec_info last
= bb_vinfo
->lookup_stmt (assign
);
4339 use_operand_p use_p
;
4341 if (!single_imm_use (def
, &use_p
, &use_stmt
))
4344 if (!bb_vinfo
->lookup_stmt (use_stmt
)
4345 || !vect_slp_is_lane_insert (use_stmt
, def
, &this_lane
)
4346 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (use_stmt
)))
4348 if (bitmap_bit_p (lanes
, this_lane
))
4351 bitmap_set_bit (lanes
, this_lane
);
4352 gassign
*use_ass
= as_a
<gassign
*> (use_stmt
);
4353 lane_defs
.quick_push (std::make_pair
4354 (this_lane
, gimple_assign_rhs2 (use_ass
)));
4355 last
= bb_vinfo
->lookup_stmt (use_ass
);
4356 def
= gimple_assign_lhs (use_ass
);
4358 while (lanes_found
< nlanes
);
4359 if (lanes_found
< nlanes
)
4361 /* Now search the def chain. */
4362 def
= gimple_assign_rhs1 (assign
);
4365 if (TREE_CODE (def
) != SSA_NAME
4366 || !has_single_use (def
))
4368 gimple
*def_stmt
= SSA_NAME_DEF_STMT (def
);
4370 if (!bb_vinfo
->lookup_stmt (def_stmt
)
4371 || !vect_slp_is_lane_insert (def_stmt
,
4372 NULL_TREE
, &this_lane
)
4373 || !bb_vinfo
->lookup_def (gimple_assign_rhs2 (def_stmt
)))
4375 if (bitmap_bit_p (lanes
, this_lane
))
4378 bitmap_set_bit (lanes
, this_lane
);
4379 lane_defs
.quick_push (std::make_pair
4381 gimple_assign_rhs2 (def_stmt
)));
4382 def
= gimple_assign_rhs1 (def_stmt
);
4384 while (lanes_found
< nlanes
);
4386 if (lanes_found
== nlanes
)
4388 /* Sort lane_defs after the lane index and register the root. */
4389 lane_defs
.qsort (vld_cmp
);
4390 vec
<stmt_vec_info
> stmts
;
4391 stmts
.create (nlanes
);
4392 for (unsigned i
= 0; i
< nlanes
; ++i
)
4393 stmts
.quick_push (bb_vinfo
->lookup_def (lane_defs
[i
].second
));
4394 bb_vinfo
->roots
.safe_push (slp_root (slp_inst_kind_ctor
,
4401 /* Walk the grouped store chains and replace entries with their
4402 pattern variant if any. */
4405 vect_fixup_store_groups_with_patterns (vec_info
*vinfo
)
4407 stmt_vec_info first_element
;
4410 FOR_EACH_VEC_ELT (vinfo
->grouped_stores
, i
, first_element
)
4412 /* We also have CTORs in this array. */
4413 if (!STMT_VINFO_GROUPED_ACCESS (first_element
))
4415 if (STMT_VINFO_IN_PATTERN_P (first_element
))
4417 stmt_vec_info orig
= first_element
;
4418 first_element
= STMT_VINFO_RELATED_STMT (first_element
);
4419 DR_GROUP_FIRST_ELEMENT (first_element
) = first_element
;
4420 DR_GROUP_SIZE (first_element
) = DR_GROUP_SIZE (orig
);
4421 DR_GROUP_GAP (first_element
) = DR_GROUP_GAP (orig
);
4422 DR_GROUP_NEXT_ELEMENT (first_element
) = DR_GROUP_NEXT_ELEMENT (orig
);
4423 vinfo
->grouped_stores
[i
] = first_element
;
4425 stmt_vec_info prev
= first_element
;
4426 while (DR_GROUP_NEXT_ELEMENT (prev
))
4428 stmt_vec_info elt
= DR_GROUP_NEXT_ELEMENT (prev
);
4429 if (STMT_VINFO_IN_PATTERN_P (elt
))
4431 stmt_vec_info orig
= elt
;
4432 elt
= STMT_VINFO_RELATED_STMT (elt
);
4433 DR_GROUP_NEXT_ELEMENT (prev
) = elt
;
4434 DR_GROUP_GAP (elt
) = DR_GROUP_GAP (orig
);
4435 DR_GROUP_NEXT_ELEMENT (elt
) = DR_GROUP_NEXT_ELEMENT (orig
);
4437 DR_GROUP_FIRST_ELEMENT (elt
) = first_element
;
4443 /* Check if the region described by BB_VINFO can be vectorized, returning
4444 true if so. When returning false, set FATAL to true if the same failure
4445 would prevent vectorization at other vector sizes, false if it is still
4446 worth trying other sizes. N_STMTS is the number of statements in the
4450 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo
, int n_stmts
, bool &fatal
,
4451 vec
<int> *dataref_groups
)
4453 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
4455 slp_instance instance
;
4457 poly_uint64 min_vf
= 2;
4459 /* The first group of checks is independent of the vector size. */
4462 /* Analyze the data references. */
4464 if (!vect_analyze_data_refs (bb_vinfo
, &min_vf
, NULL
))
4466 if (dump_enabled_p ())
4467 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4468 "not vectorized: unhandled data-ref in basic "
4473 if (!vect_analyze_data_ref_accesses (bb_vinfo
, dataref_groups
))
4475 if (dump_enabled_p ())
4476 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4477 "not vectorized: unhandled data access in "
4482 vect_slp_check_for_constructors (bb_vinfo
);
4484 /* If there are no grouped stores and no constructors in the region
4485 there is no need to continue with pattern recog as vect_analyze_slp
4486 will fail anyway. */
4487 if (bb_vinfo
->grouped_stores
.is_empty ()
4488 && bb_vinfo
->roots
.is_empty ())
4490 if (dump_enabled_p ())
4491 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4492 "not vectorized: no grouped stores in "
4497 /* While the rest of the analysis below depends on it in some way. */
4500 vect_pattern_recog (bb_vinfo
);
4502 /* Update store groups from pattern processing. */
4503 vect_fixup_store_groups_with_patterns (bb_vinfo
);
4505 /* Check the SLP opportunities in the basic block, analyze and build SLP
4507 if (!vect_analyze_slp (bb_vinfo
, n_stmts
))
4509 if (dump_enabled_p ())
4511 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4512 "Failed to SLP the basic block.\n");
4513 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4514 "not vectorized: failed to find SLP opportunities "
4515 "in basic block.\n");
4520 /* Optimize permutations. */
4521 vect_optimize_slp (bb_vinfo
);
4523 /* Gather the loads reachable from the SLP graph entries. */
4524 vect_gather_slp_loads (bb_vinfo
);
4526 vect_record_base_alignments (bb_vinfo
);
4528 /* Analyze and verify the alignment of data references and the
4529 dependence in the SLP instances. */
4530 for (i
= 0; BB_VINFO_SLP_INSTANCES (bb_vinfo
).iterate (i
, &instance
); )
4532 vect_location
= instance
->location ();
4533 if (! vect_slp_analyze_instance_alignment (bb_vinfo
, instance
)
4534 || ! vect_slp_analyze_instance_dependence (bb_vinfo
, instance
))
4536 slp_tree node
= SLP_INSTANCE_TREE (instance
);
4537 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
4538 if (dump_enabled_p ())
4539 dump_printf_loc (MSG_NOTE
, vect_location
,
4540 "removing SLP instance operations starting from: %G",
4542 vect_free_slp_instance (instance
);
4543 BB_VINFO_SLP_INSTANCES (bb_vinfo
).ordered_remove (i
);
4547 /* Mark all the statements that we want to vectorize as pure SLP and
4549 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance
));
4550 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance
));
4551 if (stmt_vec_info root
= SLP_INSTANCE_ROOT_STMT (instance
))
4553 STMT_SLP_TYPE (root
) = pure_slp
;
4554 if (is_gimple_assign (root
->stmt
)
4555 && gimple_assign_rhs_code (root
->stmt
) == BIT_INSERT_EXPR
)
4557 /* ??? We should probably record the whole vector of
4558 root stmts so we do not have to back-track here... */
4559 for (unsigned n
= SLP_TREE_LANES (SLP_INSTANCE_TREE (instance
));
4562 root
= bb_vinfo
->lookup_def (gimple_assign_rhs1 (root
->stmt
));
4563 STMT_SLP_TYPE (root
) = pure_slp
;
4570 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo
).length ())
4573 if (!vect_slp_analyze_operations (bb_vinfo
))
4575 if (dump_enabled_p ())
4576 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4577 "not vectorized: bad operation in basic block.\n");
4581 vect_bb_partition_graph (bb_vinfo
);
4586 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
4587 basic blocks in BBS, returning true on success.
4588 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
4591 vect_slp_region (vec
<basic_block
> bbs
, vec
<data_reference_p
> datarefs
,
4592 vec
<int> *dataref_groups
, unsigned int n_stmts
)
4594 bb_vec_info bb_vinfo
;
4595 auto_vector_modes vector_modes
;
4597 /* Autodetect first vector size we try. */
4598 machine_mode next_vector_mode
= VOIDmode
;
4599 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
4600 unsigned int mode_i
= 0;
4602 vec_info_shared shared
;
4604 machine_mode autodetected_vector_mode
= VOIDmode
;
4607 bool vectorized
= false;
4609 bb_vinfo
= new _bb_vec_info (bbs
, &shared
);
4611 bool first_time_p
= shared
.datarefs
.is_empty ();
4612 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
4614 bb_vinfo
->shared
->save_datarefs ();
4616 bb_vinfo
->shared
->check_datarefs ();
4617 bb_vinfo
->vector_mode
= next_vector_mode
;
4619 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
))
4621 if (dump_enabled_p ())
4623 dump_printf_loc (MSG_NOTE
, vect_location
,
4624 "***** Analysis succeeded with vector mode"
4625 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
4626 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
4629 bb_vinfo
->shared
->check_datarefs ();
4632 slp_instance instance
;
4633 FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo
), i
, instance
)
4635 if (instance
->subgraph_entries
.is_empty ())
4638 vect_location
= instance
->location ();
4639 if (!unlimited_cost_model (NULL
)
4640 && !vect_bb_vectorization_profitable_p
4641 (bb_vinfo
, instance
->subgraph_entries
))
4643 if (dump_enabled_p ())
4644 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4645 "not vectorized: vectorization is not "
4650 if (!dbg_cnt (vect_slp
))
4653 if (!vectorized
&& dump_enabled_p ())
4654 dump_printf_loc (MSG_NOTE
, vect_location
,
4655 "Basic block will be vectorized "
4659 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
4661 unsigned HOST_WIDE_INT bytes
;
4662 if (dump_enabled_p ())
4665 (bb_vinfo
->vector_mode
).is_constant (&bytes
))
4666 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4667 "basic block part vectorized using %wu "
4668 "byte vectors\n", bytes
);
4670 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, vect_location
,
4671 "basic block part vectorized using "
4672 "variable length vectors\n");
4678 if (dump_enabled_p ())
4679 dump_printf_loc (MSG_NOTE
, vect_location
,
4680 "***** Analysis failed with vector mode %s\n",
4681 GET_MODE_NAME (bb_vinfo
->vector_mode
));
4685 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
4688 while (mode_i
< vector_modes
.length ()
4689 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
4691 if (dump_enabled_p ())
4692 dump_printf_loc (MSG_NOTE
, vect_location
,
4693 "***** The result for vector mode %s would"
4695 GET_MODE_NAME (vector_modes
[mode_i
]));
4701 if (mode_i
< vector_modes
.length ()
4702 && VECTOR_MODE_P (autodetected_vector_mode
)
4703 && (related_vector_mode (vector_modes
[mode_i
],
4704 GET_MODE_INNER (autodetected_vector_mode
))
4705 == autodetected_vector_mode
)
4706 && (related_vector_mode (autodetected_vector_mode
,
4707 GET_MODE_INNER (vector_modes
[mode_i
]))
4708 == vector_modes
[mode_i
]))
4710 if (dump_enabled_p ())
4711 dump_printf_loc (MSG_NOTE
, vect_location
,
4712 "***** Skipping vector mode %s, which would"
4713 " repeat the analysis for %s\n",
4714 GET_MODE_NAME (vector_modes
[mode_i
]),
4715 GET_MODE_NAME (autodetected_vector_mode
));
4720 || mode_i
== vector_modes
.length ()
4721 || autodetected_vector_mode
== VOIDmode
4722 /* If vect_slp_analyze_bb_1 signaled that analysis for all
4723 vector sizes will fail do not bother iterating. */
4727 /* Try the next biggest vector size. */
4728 next_vector_mode
= vector_modes
[mode_i
++];
4729 if (dump_enabled_p ())
4730 dump_printf_loc (MSG_NOTE
, vect_location
,
4731 "***** Re-trying analysis with vector mode %s\n",
4732 GET_MODE_NAME (next_vector_mode
));
4737 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
4738 true if anything in the basic-block was vectorized. */
4741 vect_slp_bbs (vec
<basic_block
> bbs
)
4743 vec
<data_reference_p
> datarefs
= vNULL
;
4744 auto_vec
<int> dataref_groups
;
4746 int current_group
= 0;
4748 for (unsigned i
= 0; i
< bbs
.length (); i
++)
4750 basic_block bb
= bbs
[i
];
4751 for (gimple_stmt_iterator gsi
= gsi_after_labels (bb
); !gsi_end_p (gsi
);
4754 gimple
*stmt
= gsi_stmt (gsi
);
4755 if (is_gimple_debug (stmt
))
4760 if (gimple_location (stmt
) != UNKNOWN_LOCATION
)
4761 vect_location
= stmt
;
4763 if (!vect_find_stmt_data_reference (NULL
, stmt
, &datarefs
,
4764 &dataref_groups
, current_group
))
4769 return vect_slp_region (bbs
, datarefs
, &dataref_groups
, insns
);
4772 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4773 true if anything in the basic-block was vectorized. */
4776 vect_slp_bb (basic_block bb
)
4778 auto_vec
<basic_block
> bbs
;
4780 return vect_slp_bbs (bbs
);
4783 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
4784 true if anything in the basic-block was vectorized. */
4787 vect_slp_function (function
*fun
)
4790 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (fun
));
4791 unsigned n
= pre_and_rev_post_order_compute_fn (fun
, NULL
, rpo
, false);
4793 /* For the moment split the function into pieces to avoid making
4794 the iteration on the vector mode moot. Split at points we know
4795 to not handle well which is CFG merges (SLP discovery doesn't
4796 handle non-loop-header PHIs) and loop exits. Since pattern
4797 recog requires reverse iteration to visit uses before defs
4798 simply chop RPO into pieces. */
4799 auto_vec
<basic_block
> bbs
;
4800 for (unsigned i
= 0; i
< n
; i
++)
4802 basic_block bb
= BASIC_BLOCK_FOR_FN (fun
, rpo
[i
]);
4805 /* Split when a BB is not dominated by the first block. */
4806 if (!bbs
.is_empty ()
4807 && !dominated_by_p (CDI_DOMINATORS
, bb
, bbs
[0]))
4809 if (dump_enabled_p ())
4810 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4811 "splitting region at dominance boundary bb%d\n",
4815 /* Split when the loop determined by the first block
4816 is exited. This is because we eventually insert
4817 invariants at region begin. */
4818 else if (!bbs
.is_empty ()
4819 && bbs
[0]->loop_father
!= bb
->loop_father
4820 && !flow_loop_nested_p (bbs
[0]->loop_father
, bb
->loop_father
))
4822 if (dump_enabled_p ())
4823 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4824 "splitting region at loop %d exit at bb%d\n",
4825 bbs
[0]->loop_father
->num
, bb
->index
);
4829 if (split
&& !bbs
.is_empty ())
4831 r
|= vect_slp_bbs (bbs
);
4833 bbs
.quick_push (bb
);
4838 /* When we have a stmt ending this block and defining a
4839 value we have to insert on edges when inserting after it for
4840 a vector containing its definition. Avoid this for now. */
4841 if (gimple
*last
= last_stmt (bb
))
4842 if (gimple_get_lhs (last
)
4843 && is_ctrl_altering_stmt (last
))
4845 if (dump_enabled_p ())
4846 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4847 "splitting region at control altering "
4848 "definition %G", last
);
4849 r
|= vect_slp_bbs (bbs
);
4854 if (!bbs
.is_empty ())
4855 r
|= vect_slp_bbs (bbs
);
4862 /* Build a variable-length vector in which the elements in ELTS are repeated
4863 to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
4864 RESULTS and add any new instructions to SEQ.
4866 The approach we use is:
4868 (1) Find a vector mode VM with integer elements of mode IM.
4870 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4871 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
4872 from small vectors to IM.
4874 (3) Duplicate each ELTS'[I] into a vector of mode VM.
4876 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
4877 correct byte contents.
4879 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
4881 We try to find the largest IM for which this sequence works, in order
4882 to cut down on the number of interleaves. */
4885 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
4886 vec
<tree
> elts
, unsigned int nresults
,
4889 unsigned int nelts
= elts
.length ();
4890 tree element_type
= TREE_TYPE (vector_type
);
4892 /* (1) Find a vector mode VM with integer elements of mode IM. */
4893 unsigned int nvectors
= 1;
4894 tree new_vector_type
;
4896 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
4897 &nvectors
, &new_vector_type
,
4901 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
4902 unsigned int partial_nelts
= nelts
/ nvectors
;
4903 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
4905 tree_vector_builder partial_elts
;
4906 auto_vec
<tree
, 32> pieces (nvectors
* 2);
4907 pieces
.quick_grow (nvectors
* 2);
4908 for (unsigned int i
= 0; i
< nvectors
; ++i
)
4910 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
4911 ELTS' has mode IM. */
4912 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
4913 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
4914 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
4915 tree t
= gimple_build_vector (seq
, &partial_elts
);
4916 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
4917 TREE_TYPE (new_vector_type
), t
);
4919 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
4920 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
4923 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
4924 correct byte contents.
4926 We need to repeat the following operation log2(nvectors) times:
4928 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
4929 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
4931 However, if each input repeats every N elements and the VF is
4932 a multiple of N * 2, the HI result is the same as the LO. */
4933 unsigned int in_start
= 0;
4934 unsigned int out_start
= nvectors
;
4935 unsigned int hi_start
= nvectors
/ 2;
4936 /* A bound on the number of outputs needed to produce NRESULTS results
4937 in the final iteration. */
4938 unsigned int noutputs_bound
= nvectors
* nresults
;
4939 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
4941 noutputs_bound
/= 2;
4942 unsigned int limit
= MIN (noutputs_bound
, nvectors
);
4943 for (unsigned int i
= 0; i
< limit
; ++i
)
4946 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
4949 pieces
[out_start
+ i
] = pieces
[out_start
+ i
- 1];
4953 tree output
= make_ssa_name (new_vector_type
);
4954 tree input1
= pieces
[in_start
+ (i
/ 2)];
4955 tree input2
= pieces
[in_start
+ (i
/ 2) + hi_start
];
4956 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
4959 gimple_seq_add_stmt (seq
, stmt
);
4960 pieces
[out_start
+ i
] = output
;
4962 std::swap (in_start
, out_start
);
4965 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
4966 results
.reserve (nresults
);
4967 for (unsigned int i
= 0; i
< nresults
; ++i
)
4969 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
4970 pieces
[in_start
+ i
]));
4972 results
.quick_push (results
[i
- nvectors
]);
4976 /* For constant and loop invariant defs in OP_NODE this function creates
4977 vector defs that will be used in the vectorized stmts and stores them
4978 to SLP_TREE_VEC_DEFS of OP_NODE. */
4981 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
4983 unsigned HOST_WIDE_INT nunits
;
4985 unsigned j
, number_of_places_left_in_vector
;
4988 int group_size
= op_node
->ops
.length ();
4989 unsigned int vec_num
, i
;
4990 unsigned number_of_copies
= 1;
4992 gimple_seq ctor_seq
= NULL
;
4993 auto_vec
<tree
, 16> permute_results
;
4995 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
4996 vector_type
= SLP_TREE_VECTYPE (op_node
);
4998 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
4999 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
5000 auto_vec
<tree
> voprnds (number_of_vectors
);
5002 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5003 created vectors. It is greater than 1 if unrolling is performed.
5005 For example, we have two scalar operands, s1 and s2 (e.g., group of
5006 strided accesses of size two), while NUNITS is four (i.e., four scalars
5007 of this type can be packed in a vector). The output vector will contain
5008 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5011 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
5012 containing the operands.
5014 For example, NUNITS is four as before, and the group size is 8
5015 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5016 {s5, s6, s7, s8}. */
5018 /* When using duplicate_and_interleave, we just need one element for
5019 each scalar statement. */
5020 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
5021 nunits
= group_size
;
5023 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
5025 number_of_places_left_in_vector
= nunits
;
5027 tree_vector_builder
elts (vector_type
, nunits
, 1);
5028 elts
.quick_grow (nunits
);
5029 stmt_vec_info insert_after
= NULL
;
5030 for (j
= 0; j
< number_of_copies
; j
++)
5033 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
5035 /* Create 'vect_ = {op0,op1,...,opn}'. */
5036 number_of_places_left_in_vector
--;
5038 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
5040 if (CONSTANT_CLASS_P (op
))
5042 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
5044 /* Can't use VIEW_CONVERT_EXPR for booleans because
5045 of possibly different sizes of scalar value and
5047 if (integer_zerop (op
))
5048 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
5049 else if (integer_onep (op
))
5050 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
5055 op
= fold_unary (VIEW_CONVERT_EXPR
,
5056 TREE_TYPE (vector_type
), op
);
5057 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
5061 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
5063 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
5066 = build_all_ones_cst (TREE_TYPE (vector_type
));
5068 = build_zero_cst (TREE_TYPE (vector_type
));
5069 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
5070 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
5076 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
5079 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
5082 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
5086 elts
[number_of_places_left_in_vector
] = op
;
5087 if (!CONSTANT_CLASS_P (op
))
5089 /* For BB vectorization we have to compute an insert location
5090 when a def is inside the analyzed region since we cannot
5091 simply insert at the BB start in this case. */
5092 stmt_vec_info opdef
;
5093 if (TREE_CODE (orig_op
) == SSA_NAME
5094 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
5095 && is_a
<bb_vec_info
> (vinfo
)
5096 && (opdef
= vinfo
->lookup_def (orig_op
)))
5099 insert_after
= opdef
;
5101 insert_after
= get_later_stmt (insert_after
, opdef
);
5104 if (number_of_places_left_in_vector
== 0)
5107 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
)
5108 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type
), nunits
))
5109 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
5112 if (permute_results
.is_empty ())
5113 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
5114 elts
, number_of_vectors
,
5116 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
5118 if (!gimple_seq_empty_p (ctor_seq
))
5122 gimple_stmt_iterator gsi
;
5123 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
5125 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
5126 gsi_insert_seq_before (&gsi
, ctor_seq
,
5127 GSI_CONTINUE_LINKING
);
5129 else if (!stmt_ends_bb_p (insert_after
->stmt
))
5131 gsi
= gsi_for_stmt (insert_after
->stmt
);
5132 gsi_insert_seq_after (&gsi
, ctor_seq
,
5133 GSI_CONTINUE_LINKING
);
5137 /* When we want to insert after a def where the
5138 defining stmt throws then insert on the fallthru
5140 edge e
= find_fallthru_edge
5141 (gimple_bb (insert_after
->stmt
)->succs
);
5143 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
5144 gcc_assert (!new_bb
);
5148 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
5151 voprnds
.quick_push (vec_cst
);
5152 insert_after
= NULL
;
5153 number_of_places_left_in_vector
= nunits
;
5155 elts
.new_vector (vector_type
, nunits
, 1);
5156 elts
.quick_grow (nunits
);
5161 /* Since the vectors are created in the reverse order, we should invert
5163 vec_num
= voprnds
.length ();
5164 for (j
= vec_num
; j
!= 0; j
--)
5166 vop
= voprnds
[j
- 1];
5167 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5170 /* In case that VF is greater than the unrolling factor needed for the SLP
5171 group of stmts, NUMBER_OF_VECTORS to be created is greater than
5172 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
5173 to replicate the vectors. */
5174 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
5175 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
5177 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
5180 /* Get the Ith vectorized definition from SLP_NODE. */
5183 vect_get_slp_vect_def (slp_tree slp_node
, unsigned i
)
5185 if (SLP_TREE_VEC_STMTS (slp_node
).exists ())
5186 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node
)[i
]);
5188 return SLP_TREE_VEC_DEFS (slp_node
)[i
];
5191 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
5194 vect_get_slp_defs (slp_tree slp_node
, vec
<tree
> *vec_defs
)
5196 vec_defs
->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
));
5197 if (SLP_TREE_DEF_TYPE (slp_node
) == vect_internal_def
)
5200 gimple
*vec_def_stmt
;
5201 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node
), j
, vec_def_stmt
)
5202 vec_defs
->quick_push (gimple_get_lhs (vec_def_stmt
));
5205 vec_defs
->splice (SLP_TREE_VEC_DEFS (slp_node
));
5208 /* Get N vectorized definitions for SLP_NODE. */
5211 vect_get_slp_defs (vec_info
*,
5212 slp_tree slp_node
, vec
<vec
<tree
> > *vec_oprnds
, unsigned n
)
5215 n
= SLP_TREE_CHILDREN (slp_node
).length ();
5217 for (unsigned i
= 0; i
< n
; ++i
)
5219 slp_tree child
= SLP_TREE_CHILDREN (slp_node
)[i
];
5220 vec
<tree
> vec_defs
= vNULL
;
5221 vect_get_slp_defs (child
, &vec_defs
);
5222 vec_oprnds
->quick_push (vec_defs
);
5226 /* Generate vector permute statements from a list of loads in DR_CHAIN.
5227 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
5228 permute statements for the SLP node NODE. Store the number of vector
5229 permute instructions in *N_PERMS and the number of vector load
5230 instructions in *N_LOADS. */
5233 vect_transform_slp_perm_load (vec_info
*vinfo
,
5234 slp_tree node
, vec
<tree
> dr_chain
,
5235 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
5236 bool analyze_only
, unsigned *n_perms
,
5237 unsigned int *n_loads
)
5239 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
5241 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5242 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
5243 unsigned int mask_element
;
5246 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
5249 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
5251 mode
= TYPE_MODE (vectype
);
5252 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5254 /* Initialize the vect stmts of NODE to properly insert the generated
5257 for (unsigned i
= SLP_TREE_VEC_STMTS (node
).length ();
5258 i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); i
++)
5259 SLP_TREE_VEC_STMTS (node
).quick_push (NULL
);
5261 /* Generate permutation masks for every NODE. Number of masks for each NODE
5262 is equal to GROUP_SIZE.
5263 E.g., we have a group of three nodes with three loads from the same
5264 location in each node, and the vector size is 4. I.e., we have a
5265 a0b0c0a1b1c1... sequence and we need to create the following vectors:
5266 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
5267 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
5270 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
5271 The last mask is illegal since we assume two operands for permute
5272 operation, and the mask element values can't be outside that range.
5273 Hence, the last mask must be converted into {2,5,5,5}.
5274 For the first two permutations we need the first and the second input
5275 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
5276 we need the second and the third vectors: {b1,c1,a2,b2} and
5279 int vect_stmts_counter
= 0;
5280 unsigned int index
= 0;
5281 int first_vec_index
= -1;
5282 int second_vec_index
= -1;
5286 vec_perm_builder mask
;
5287 unsigned int nelts_to_build
;
5288 unsigned int nvectors_per_build
;
5289 unsigned int in_nlanes
;
5290 bool repeating_p
= (group_size
== DR_GROUP_SIZE (stmt_info
)
5291 && multiple_p (nunits
, group_size
));
5294 /* A single vector contains a whole number of copies of the node, so:
5295 (a) all permutes can use the same mask; and
5296 (b) the permutes only need a single vector input. */
5297 mask
.new_vector (nunits
, group_size
, 3);
5298 nelts_to_build
= mask
.encoded_nelts ();
5299 nvectors_per_build
= SLP_TREE_VEC_STMTS (node
).length ();
5300 in_nlanes
= DR_GROUP_SIZE (stmt_info
) * 3;
5304 /* We need to construct a separate mask for each vector statement. */
5305 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
5306 if (!nunits
.is_constant (&const_nunits
)
5307 || !vf
.is_constant (&const_vf
))
5309 mask
.new_vector (const_nunits
, const_nunits
, 1);
5310 nelts_to_build
= const_vf
* group_size
;
5311 nvectors_per_build
= 1;
5312 in_nlanes
= const_vf
* DR_GROUP_SIZE (stmt_info
);
5314 auto_sbitmap
used_in_lanes (in_nlanes
);
5315 bitmap_clear (used_in_lanes
);
5317 unsigned int count
= mask
.encoded_nelts ();
5318 mask
.quick_grow (count
);
5319 vec_perm_indices indices
;
5321 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
5323 unsigned int iter_num
= j
/ group_size
;
5324 unsigned int stmt_num
= j
% group_size
;
5325 unsigned int i
= (iter_num
* DR_GROUP_SIZE (stmt_info
)
5326 + SLP_TREE_LOAD_PERMUTATION (node
)[stmt_num
]);
5327 bitmap_set_bit (used_in_lanes
, i
);
5330 first_vec_index
= 0;
5335 /* Enforced before the loop when !repeating_p. */
5336 unsigned int const_nunits
= nunits
.to_constant ();
5337 vec_index
= i
/ const_nunits
;
5338 mask_element
= i
% const_nunits
;
5339 if (vec_index
== first_vec_index
5340 || first_vec_index
== -1)
5342 first_vec_index
= vec_index
;
5344 else if (vec_index
== second_vec_index
5345 || second_vec_index
== -1)
5347 second_vec_index
= vec_index
;
5348 mask_element
+= const_nunits
;
5352 if (dump_enabled_p ())
5353 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5354 "permutation requires at "
5355 "least three vectors %G",
5357 gcc_assert (analyze_only
);
5361 gcc_assert (mask_element
< 2 * const_nunits
);
5364 if (mask_element
!= index
)
5366 mask
[index
++] = mask_element
;
5368 if (index
== count
&& !noop_p
)
5370 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
5371 if (!can_vec_perm_const_p (mode
, indices
))
5373 if (dump_enabled_p ())
5375 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5377 "unsupported vect permute { ");
5378 for (i
= 0; i
< count
; ++i
)
5380 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5381 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5383 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
5385 gcc_assert (analyze_only
);
5396 tree mask_vec
= NULL_TREE
;
5399 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5401 if (second_vec_index
== -1)
5402 second_vec_index
= first_vec_index
;
5404 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
5406 /* Generate the permute statement if necessary. */
5407 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
5408 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
5412 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
5414 = vect_create_destination_var (gimple_assign_lhs (stmt
),
5416 perm_dest
= make_ssa_name (perm_dest
);
5418 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5419 first_vec
, second_vec
,
5421 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
5425 /* If mask was NULL_TREE generate the requested
5426 identity transform. */
5427 perm_stmt
= SSA_NAME_DEF_STMT (first_vec
);
5429 /* Store the vector statement in NODE. */
5430 SLP_TREE_VEC_STMTS (node
)[vect_stmts_counter
++] = perm_stmt
;
5435 first_vec_index
= -1;
5436 second_vec_index
= -1;
5444 *n_loads
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
5447 /* Enforced above when !repeating_p. */
5448 unsigned int const_nunits
= nunits
.to_constant ();
5450 bool load_seen
= false;
5451 for (unsigned i
= 0; i
< in_nlanes
; ++i
)
5453 if (i
% const_nunits
== 0)
5459 if (bitmap_bit_p (used_in_lanes
, i
))
5471 /* Vectorize the SLP permutations in NODE as specified
5472 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
5473 child number and lane number.
5474 Interleaving of two two-lane two-child SLP subtrees (not supported):
5475 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
5476 A blend of two four-lane two-child SLP subtrees:
5477 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
5478 Highpart of a four-lane one-child SLP subtree (not supported):
5479 [ { 0, 2 }, { 0, 3 } ]
5480 Where currently only a subset is supported by code generating below. */
/* Check (and, when an insertion iterator is available, code generate) the
   lane permutation recorded on the VEC_PERM_EXPR SLP node NODE, recording
   the permute cost into COST_VEC.  Only single- and two-vector permutes
   supported by the target (can_vec_perm_const_p) are handled here;
   everything else is rejected with a missed-optimization dump.
   NOTE(review): this chunk is a garbled extraction -- each original source
   line is split across several physical lines and some original lines
   (braces, returns) are missing.  The code below is kept byte-identical;
   comments only are added.  */
5483 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
5484 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
5486 tree vectype
= SLP_TREE_VECTYPE (node
);
5488 /* ??? We currently only support all same vector input and output types
5489 while the SLP IL should really do a concat + select and thus accept
5490 arbitrary mismatches. */
5493 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5494 if (!vect_maybe_update_slp_op_vectype (child
, vectype
)
5495 || !types_compatible_p (SLP_TREE_VECTYPE (child
), vectype
))
5497 if (dump_enabled_p ())
5498 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5499 "Unsupported lane permutation\n");
/* The node's lane permutation is a vector of { operand index, scalar lane }
   pairs, one entry per SLP lane (asserted below).  */
5503 vec
<std::pair
<unsigned, unsigned> > &perm
= SLP_TREE_LANE_PERMUTATION (node
);
5504 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
5505 if (dump_enabled_p ())
5507 dump_printf_loc (MSG_NOTE
, vect_location
,
5508 "vectorizing permutation");
5509 for (unsigned i
= 0; i
< perm
.length (); ++i
)
5510 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
5511 dump_printf (MSG_NOTE
, "\n");
/* A variable number of subparts (VLA vectors) is not handled; for loop
   vectorization the (constant) vectorization factor scales the number of
   output lanes to cover.  */
5514 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
5515 if (!nunits
.is_constant ())
5517 unsigned HOST_WIDE_INT vf
= 1;
5518 if (loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
))
5519 if (!LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&vf
))
5521 unsigned olanes
= vf
* SLP_TREE_LANES (node
);
5522 gcc_assert (multiple_p (olanes
, nunits
));
5524 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
5525 from the { SLP operand, scalar lane } permutation as recorded in the
5526 SLP node as intermediate step. This part should already work
5527 with SLP children with arbitrary number of lanes. */
5528 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, unsigned> > vperm
;
5529 auto_vec
<unsigned> active_lane
;
5530 vperm
.create (olanes
);
5531 active_lane
.safe_grow_cleared (SLP_TREE_CHILDREN (node
).length (), true);
5532 for (unsigned i
= 0; i
< vf
; ++i
)
5534 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
5536 std::pair
<unsigned, unsigned> p
= perm
[pi
];
5537 tree vtype
= SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node
)[p
.first
]);
5538 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
).to_constant ();
/* Map the scalar lane within the child onto a (vector number, lane within
   vector) pair, offset by how far this child has advanced across VF copies.  */
5539 unsigned vi
= (active_lane
[p
.first
] + p
.second
) / vnunits
;
5540 unsigned vl
= (active_lane
[p
.first
] + p
.second
) % vnunits
;
5541 vperm
.quick_push (std::make_pair (std::make_pair (p
.first
, vi
), vl
));
5543 /* Advance to the next group. */
5544 for (unsigned j
= 0; j
< SLP_TREE_CHILDREN (node
).length (); ++j
)
5545 active_lane
[j
] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node
)[j
]);
5548 if (dump_enabled_p ())
5550 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
5551 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5553 if (i
!= 0 && multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
)))
5554 dump_printf (MSG_NOTE
, ",");
5555 dump_printf (MSG_NOTE
, " vops%u[%u][%u]",
5556 vperm
[i
].first
.first
, vperm
[i
].first
.second
,
/* NOTE(review): the third "%u" argument here repeats .first.second;
   upstream GCC passes vperm[i].second (the lane) for the last specifier --
   this looks like an extraction artifact; verify against the full source.  */
5557 vperm
[i
].first
.second
);
5559 dump_printf (MSG_NOTE
, "\n");
5562 /* We can only handle two-vector permutes, everything else should
5563 be lowered on the SLP level. The following is closely inspired
5564 by vect_transform_slp_perm_load and is supposed to eventually
5566 ??? As intermediate step do code-gen in the SLP tree representation
/* first_vec/second_vec track the (operand, vector index) inputs of the
   current output vector; -1U marks "not yet assigned".  */
5568 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
5569 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
5570 unsigned int const_nunits
= nunits
.to_constant ();
5571 unsigned int index
= 0;
5572 unsigned int mask_element
;
5573 vec_perm_builder mask
;
5574 mask
.new_vector (const_nunits
, const_nunits
, 1);
5575 unsigned int count
= mask
.encoded_nelts ();
5576 mask
.quick_grow (count
);
5577 vec_perm_indices indices
;
5578 unsigned nperms
= 0;
5579 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
5581 mask_element
= vperm
[i
].second
;
5582 if (first_vec
.first
== -1U
5583 || first_vec
== vperm
[i
].first
)
5584 first_vec
= vperm
[i
].first
;
5585 else if (second_vec
.first
== -1U
5586 || second_vec
== vperm
[i
].first
)
5588 second_vec
= vperm
[i
].first
;
/* Lanes drawn from the second input vector are indexed past the first
   vector's lanes in the permute mask.  */
5589 mask_element
+= const_nunits
;
5593 if (dump_enabled_p ())
5594 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5595 "permutation requires at "
5596 "least three vectors\n");
5601 mask
[index
++] = mask_element
;
/* One output vector's worth of mask elements is complete: query the target
   and either emit the VEC_PERM_EXPR or just cost it.  */
5605 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
5607 bool identity_p
= indices
.series_p (0, 1, 0, 1);
5609 && !can_vec_perm_const_p (TYPE_MODE (vectype
), indices
))
5611 if (dump_enabled_p ())
5613 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
5615 "unsupported vect permute { ");
5616 for (i
= 0; i
< count
; ++i
)
5618 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
5619 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
5621 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
/* A single-input permute uses the first input for both operands.  */
5631 if (second_vec
.first
== -1U)
5632 second_vec
= first_vec
;
5634 /* Generate the permute statement if necessary. */
5635 slp_tree first_node
= SLP_TREE_CHILDREN (node
)[first_vec
.first
];
5637 = vect_get_slp_vect_def (first_node
, first_vec
.second
);
5639 tree perm_dest
= make_ssa_name (vectype
);
5642 slp_tree second_node
5643 = SLP_TREE_CHILDREN (node
)[second_vec
.first
];
5645 = vect_get_slp_vect_def (second_node
, second_vec
.second
);
5646 tree mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
5647 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
5648 first_def
, second_def
,
5652 /* We need a copy here in case the def was external. */
5653 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
5654 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
5655 /* Store the vector statement in NODE. */
5656 SLP_TREE_VEC_STMTS (node
).quick_push (perm_stmt
);
5660 first_vec
= std::make_pair (-1U, -1U);
5661 second_vec
= std::make_pair (-1U, -1U);
/* Analysis path: record the accumulated number of permutes as vec_perm
   cost in the body.  */
5666 record_stmt_cost (cost_vec
, nperms
, vec_perm
, NULL
, vectype
, 0, vect_body
);
5671 /* Vectorize SLP NODE. */
/* Code generate one SLP node NODE of INSTANCE: decide the gimple insertion
   point SI (before the first scalar load for loads, before the last scalar
   stmt for stores, after the latest child def otherwise), then dispatch to
   either the open-coded permute handling or vect_transform_stmt.
   NOTE(review): garbled extraction -- lines split mid-token, several
   original lines missing; code kept byte-identical, comments only added.  */
5674 vect_schedule_slp_node (vec_info
*vinfo
,
5675 slp_tree node
, slp_instance instance
)
5677 gimple_stmt_iterator si
;
5681 /* For existing vectors there's nothing to do. */
5682 if (SLP_TREE_VEC_DEFS (node
).exists ())
5685 gcc_assert (SLP_TREE_VEC_STMTS (node
).is_empty ());
5687 /* Vectorize externals and constants. */
5688 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
5689 || SLP_TREE_DEF_TYPE (node
) == vect_external_def
)
5691 /* ??? vectorizable_shift can end up using a scalar operand which is
5692 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
5693 node in this case. */
5694 if (!SLP_TREE_VECTYPE (node
))
5697 vect_create_constant_vectors (vinfo
, node
);
/* Internal node: the representative stmt carries the transform info.  */
5701 stmt_vec_info stmt_info
= SLP_TREE_REPRESENTATIVE (node
);
5703 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) != 0);
5704 SLP_TREE_VEC_STMTS (node
).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node
));
5706 if (dump_enabled_p ())
5707 dump_printf_loc (MSG_NOTE
, vect_location
,
5708 "------>vectorizing SLP node starting from: %G",
5711 if (STMT_VINFO_DATA_REF (stmt_info
)
5712 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
5714 /* Vectorized loads go before the first scalar load to make it
5715 ready early, vectorized stores go before the last scalar
5716 stmt which is where all uses are ready. */
5717 stmt_vec_info last_stmt_info
= NULL
;
5718 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
5719 last_stmt_info
= vect_find_first_scalar_stmt_in_slp (node
);
5720 else /* DR_IS_WRITE */
5721 last_stmt_info
= vect_find_last_scalar_stmt_in_slp (node
);
5722 si
= gsi_for_stmt (last_stmt_info
->stmt
);
5724 else if ((STMT_VINFO_TYPE (stmt_info
) == cycle_phi_info_type
5725 || STMT_VINFO_TYPE (stmt_info
) == induc_vec_info_type
5726 || STMT_VINFO_TYPE (stmt_info
) == phi_info_type
)
5727 && SLP_TREE_CODE (node
) != VEC_PERM_EXPR
)
5729 /* For PHI node vectorization we do not use the insertion iterator. */
5734 /* Emit other stmts after the children vectorized defs which is
5735 earliest possible. */
5736 gimple
*last_stmt
= NULL
;
5737 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5738 if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
5740 /* For fold-left reductions we are retaining the scalar
5741 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
5742 set so the representation isn't perfect. Resort to the
5743 last scalar def here. */
5744 if (SLP_TREE_VEC_STMTS (child
).is_empty ())
5746 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child
))
5747 == cycle_phi_info_type
);
5748 gphi
*phi
= as_a
<gphi
*>
5749 (vect_find_last_scalar_stmt_in_slp (child
)->stmt
);
5751 || vect_stmt_dominates_stmt_p (last_stmt
, phi
))
5754 /* We are emitting all vectorized stmts in the same place and
5755 the last one is the last.
5756 ??? Unless we have a load permutation applied and that
5757 figures to re-use an earlier generated load. */
5760 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child
), j
, vstmt
)
5762 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
5765 else if (!SLP_TREE_VECTYPE (child
))
5767 /* For externals we use unvectorized at all scalar defs. */
5770 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child
), j
, def
)
5771 if (TREE_CODE (def
) == SSA_NAME
5772 && !SSA_NAME_IS_DEFAULT_DEF (def
))
5774 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
5776 || vect_stmt_dominates_stmt_p (last_stmt
, stmt
))
5782 /* For externals we have to look at all defs since their
5783 insertion place is decided per vector. But beware
5784 of pre-existing vectors where we need to make sure
5785 we do not insert before the region boundary. */
5786 if (SLP_TREE_SCALAR_OPS (child
).is_empty ()
5787 && !vinfo
->lookup_def (SLP_TREE_VEC_DEFS (child
)[0]))
5788 last_stmt
= gsi_stmt (gsi_after_labels
5789 (as_a
<bb_vec_info
> (vinfo
)->bbs
[0]));
5794 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child
), j
, vdef
)
5795 if (TREE_CODE (vdef
) == SSA_NAME
5796 && !SSA_NAME_IS_DEFAULT_DEF (vdef
))
5798 gimple
*vstmt
= SSA_NAME_DEF_STMT (vdef
);
5800 || vect_stmt_dominates_stmt_p (last_stmt
, vstmt
))
5805 /* This can happen when all children are pre-existing vectors or
5808 last_stmt
= vect_find_first_scalar_stmt_in_slp (node
)->stmt
;
/* PHIs cannot be an insertion point themselves; insert after labels.  */
5809 if (is_a
<gphi
*> (last_stmt
))
5810 si
= gsi_after_labels (gimple_bb (last_stmt
));
5813 si
= gsi_for_stmt (last_stmt
);
5818 bool done_p
= false;
5820 /* Handle purely internal nodes. */
5821 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
5823 /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
5824 be shared with different SLP nodes (but usually it's the same
5825 operation apart from the case the stmt is only there for denoting
5826 the actual scalar lane defs ...). So do not call vect_transform_stmt
5827 but open-code it here (partly). */
5828 bool done
= vectorizable_slp_permutation (vinfo
, &si
, node
, NULL
);
5833 vect_transform_stmt (vinfo
, stmt_info
, &si
, node
, instance
);
5836 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
5837 For loop vectorization this is done in vectorizable_call, but for SLP
5838 it needs to be deferred until end of vect_schedule_slp, because multiple
5839 SLP instances may refer to the same scalar stmt. */
/* Recursive worker: walk the SLP tree rooted at NODE depth-first, using
   VISITED so each shared node is processed once, and replace every
   vectorized pure-SLP scalar call stmt with "lhs = 0" (pattern stmts and
   already-removed stmts are skipped).
   NOTE(review): garbled extraction -- declarations of lhs/new_stmt and
   some braces are missing from this view; code kept byte-identical.  */
5842 vect_remove_slp_scalar_calls (vec_info
*vinfo
,
5843 slp_tree node
, hash_set
<slp_tree
> &visited
)
5846 gimple_stmt_iterator gsi
;
5850 stmt_vec_info stmt_info
;
/* Only internal-def nodes carry scalar stmts that were vectorized.  */
5852 if (!node
|| SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
5855 if (visited
.add (node
))
5858 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5859 vect_remove_slp_scalar_calls (vinfo
, child
, visited
);
5861 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), i
, stmt_info
)
5863 gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
);
5864 if (!stmt
|| gimple_bb (stmt
) == NULL
)
5866 if (is_pattern_stmt_p (stmt_info
)
5867 || !PURE_SLP_STMT (stmt_info
))
/* Replace the call with a zero-assignment to its lhs and keep the SSA
   def link consistent.  */
5869 lhs
= gimple_call_lhs (stmt
);
5870 new_stmt
= gimple_build_assign (lhs
, build_zero_cst (TREE_TYPE (lhs
)));
5871 gsi
= gsi_for_stmt (stmt
);
5872 vinfo
->replace_stmt (&gsi
, stmt_info
, new_stmt
);
5873 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt
)) = new_stmt
;
/* Public entry point: seed an empty visited set and delegate to the
   recursive worker above.  */
5878 vect_remove_slp_scalar_calls (vec_info
*vinfo
, slp_tree node
)
5880 hash_set
<slp_tree
> visited
;
5881 vect_remove_slp_scalar_calls (vinfo
, node
, visited
);
5884 /* Vectorize the instance root. */
/* Replace INSTANCE's scalar root stmt with an assignment from the
   vectorized defs of NODE: a single vector stmt becomes a direct (possibly
   VIEW_CONVERT_EXPR'd) assignment, multiple vector stmts are gathered into
   a CONSTRUCTOR of the root's rhs type.
   NOTE(review): garbled extraction -- some declarations/braces are missing
   from this view; code kept byte-identical, comments only added.  */
5887 vectorize_slp_instance_root_stmt (slp_tree node
, slp_instance instance
)
5889 gassign
*rstmt
= NULL
;
5891 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) == 1)
5896 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
5898 tree vect_lhs
= gimple_get_lhs (child_stmt
);
5899 tree root_lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
/* Insert a VIEW_CONVERT_EXPR when the vectorized lhs type differs from
   the root lhs type.  */
5900 if (!useless_type_conversion_p (TREE_TYPE (root_lhs
),
5901 TREE_TYPE (vect_lhs
)))
5902 vect_lhs
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (root_lhs
),
5904 rstmt
= gimple_build_assign (root_lhs
, vect_lhs
);
5908 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node
) > 1)
5910 int nelts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
/* Collect all vectorized defs into a CONSTRUCTOR of the root rhs type.  */
5913 vec
<constructor_elt
, va_gc
> *v
;
5914 vec_alloc (v
, nelts
);
5916 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node
), j
, child_stmt
)
5918 CONSTRUCTOR_APPEND_ELT (v
,
5920 gimple_get_lhs (child_stmt
));
5922 tree lhs
= gimple_get_lhs (instance
->root_stmt
->stmt
);
5923 tree rtype
= TREE_TYPE (gimple_assign_rhs1 (instance
->root_stmt
->stmt
));
5924 tree r_constructor
= build_constructor (rtype
, v
);
5925 rstmt
= gimple_build_assign (lhs
, r_constructor
);
/* Swap the new assignment in for the scalar root stmt.  */
5930 gimple_stmt_iterator rgsi
= gsi_for_stmt (instance
->root_stmt
->stmt
);
5931 gsi_replace (&rgsi
, rstmt
, true);
5941 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
/* Tarjan-style DFS over the SLP graph (dfs/lowlink/on_stack in
   slp_scc_info, explicit STACK): schedule nodes in topological order and,
   when an SCC is detected (lowlink == dfs), break the cycle at PHI nodes
   and iterate until every member of the SCC is code generated, finally
   fixing up the backedge arguments of the vectorized PHIs.
   NOTE(review): garbled extraction -- lines split mid-token and some
   original lines (dfs assignment, loop headers, returns) are missing from
   this view; code kept byte-identical, comments only added.  */
5944 vect_schedule_scc (vec_info
*vinfo
, slp_tree node
, slp_instance instance
,
5945 hash_map
<slp_tree
, slp_scc_info
> &scc_info
,
5946 int &maxdfs
, vec
<slp_tree
> &stack
)
5949 slp_scc_info
*info
= &scc_info
.get_or_insert (node
, &existed_p
);
5950 gcc_assert (!existed_p
);
5952 info
->lowlink
= maxdfs
;
/* Non-internal nodes (constants/externals) never participate in a cycle;
   schedule them immediately.  */
5956 if (SLP_TREE_DEF_TYPE (node
) != vect_internal_def
)
5958 info
->on_stack
= false;
5959 vect_schedule_slp_node (vinfo
, node
, instance
);
5963 info
->on_stack
= true;
5964 stack
.safe_push (node
);
5969 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node
), i
, child
)
5973 slp_scc_info
*child_info
= scc_info
.get (child
);
5976 vect_schedule_scc (vinfo
, child
, instance
, scc_info
, maxdfs
, stack
);
5977 /* Recursion might have re-allocated the node. */
5978 info
= scc_info
.get (node
);
5979 child_info
= scc_info
.get (child
);
5980 info
->lowlink
= MIN (info
->lowlink
, child_info
->lowlink
);
5982 else if (child_info
->on_stack
)
5983 info
->lowlink
= MIN (info
->lowlink
, child_info
->dfs
);
/* NODE is an SCC root exactly when its lowlink equals its dfs number.  */
5985 if (info
->lowlink
!= info
->dfs
)
5988 auto_vec
<slp_tree
, 4> phis_to_fixup
;
/* Singleton SCC: pop and schedule directly, remembering PHIs for the
   backedge fixup below.  */
5991 if (stack
.last () == node
)
5994 info
->on_stack
= false;
5995 vect_schedule_slp_node (vinfo
, node
, instance
);
5996 if (SLP_TREE_CODE (node
) != VEC_PERM_EXPR
5997 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (node
)->stmt
))
5998 phis_to_fixup
.quick_push (node
);
6003 int last_idx
= stack
.length () - 1;
6004 while (stack
[last_idx
] != node
)
6006 /* We can break the cycle at PHIs who have at least one child
6007 code generated. Then we could re-start the DFS walk until
6008 all nodes in the SCC are covered (we might have new entries
6009 for only back-reachable nodes). But it's simpler to just
6010 iterate and schedule those that are ready. */
6011 unsigned todo
= stack
.length () - last_idx
;
6014 for (int idx
= stack
.length () - 1; idx
>= last_idx
; --idx
)
6016 slp_tree entry
= stack
[idx
];
6019 bool phi
= (SLP_TREE_CODE (entry
) != VEC_PERM_EXPR
6020 && is_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (entry
)->stmt
));
6022 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry
), i
, child
)
6029 else if (scc_info
.get (child
)->on_stack
)
6047 vect_schedule_slp_node (vinfo
, entry
, instance
);
6048 scc_info
.get (entry
)->on_stack
= false;
6052 phis_to_fixup
.safe_push (entry
);
/* All SCC members scheduled; drop them from the DFS stack.  */
6059 stack
.truncate (last_idx
);
6062 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
6064 FOR_EACH_VEC_ELT (phis_to_fixup
, i
, phi_node
)
6066 gphi
*phi
= as_a
<gphi
*> (SLP_TREE_REPRESENTATIVE (phi_node
)->stmt
);
6069 FOR_EACH_EDGE (e
, ei
, gimple_bb (phi
)->preds
)
6071 unsigned dest_idx
= e
->dest_idx
;
6072 child
= SLP_TREE_CHILDREN (phi_node
)[dest_idx
];
6073 if (!child
|| SLP_TREE_DEF_TYPE (child
) != vect_internal_def
)
6075 /* Simply fill all args. */
6076 for (unsigned i
= 0; i
< SLP_TREE_VEC_STMTS (phi_node
).length (); ++i
)
6077 add_phi_arg (as_a
<gphi
*> (SLP_TREE_VEC_STMTS (phi_node
)[i
]),
6078 vect_get_slp_vect_def (child
, i
),
6079 e
, gimple_phi_arg_location (phi
, dest_idx
));
6084 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
/* Driver: schedule each SLP instance via the SCC walk above, vectorize any
   instance root stmt, then clean up -- remove vectorized scalar calls (loop
   vectorization only) and the original scalar stmts of vectorized stores.
   NOTE(review): garbled extraction; additionally this definition is
   truncated at the end of the visible chunk -- code kept byte-identical,
   comments only added, nothing reconstructed.  */
6087 vect_schedule_slp (vec_info
*vinfo
, vec
<slp_instance
> slp_instances
)
6089 slp_instance instance
;
6092 hash_map
<slp_tree
, slp_scc_info
> scc_info
;
6094 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6096 slp_tree node
= SLP_INSTANCE_TREE (instance
);
6097 if (dump_enabled_p ())
6099 dump_printf_loc (MSG_NOTE
, vect_location
,
6100 "Vectorizing SLP tree:\n");
6101 if (SLP_INSTANCE_ROOT_STMT (instance
))
6102 dump_printf_loc (MSG_NOTE
, vect_location
, "Root stmt: %G",
6103 SLP_INSTANCE_ROOT_STMT (instance
)->stmt
);
6104 vect_print_slp_graph (MSG_NOTE
, vect_location
,
6105 SLP_INSTANCE_TREE (instance
));
6107 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
6108 have a PHI be the node breaking the cycle. */
6109 auto_vec
<slp_tree
> stack
;
/* Nodes shared between instances may already be scheduled; the scc_info
   entry marks that.  */
6110 if (!scc_info
.get (node
))
6111 vect_schedule_scc (vinfo
, node
, instance
, scc_info
, maxdfs
, stack
);
6113 if (SLP_INSTANCE_ROOT_STMT (instance
))
6114 vectorize_slp_instance_root_stmt (node
, instance
);
6116 if (dump_enabled_p ())
6117 dump_printf_loc (MSG_NOTE
, vect_location
,
6118 "vectorizing stmts using SLP.\n");
/* Second pass over the instances: scalar-side cleanup.  */
6121 FOR_EACH_VEC_ELT (slp_instances
, i
, instance
)
6123 slp_tree root
= SLP_INSTANCE_TREE (instance
);
6124 stmt_vec_info store_info
;
6127 /* Remove scalar call stmts. Do not do this for basic-block
6128 vectorization as not all uses may be vectorized.
6129 ??? Why should this be necessary? DCE should be able to
6130 remove the stmts itself.
6131 ??? For BB vectorization we can as well remove scalar
6132 stmts starting from the SLP tree root if they have no
6134 if (is_a
<loop_vec_info
> (vinfo
))
6135 vect_remove_slp_scalar_calls (vinfo
, root
);
6137 /* Remove vectorized stores original scalar stmts. */
6138 for (j
= 0; SLP_TREE_SCALAR_STMTS (root
).iterate (j
, &store_info
); j
++)
6140 if (!STMT_VINFO_DATA_REF (store_info
)
6141 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info
)))
/* Map back to the original (pre-pattern) stmt before removal.  */
6144 store_info
= vect_orig_stmt (store_info
);
6145 /* Free the attached stmt_vec_info and remove the stmt. */
6146 vinfo
->remove_stmt (store_info
);