]> gcc.gnu.org Git - gcc.git/blob - gcc/config/aarch64/aarch64.c
1df8d620527dea4f4227d226b51557d69ba6132d
[gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Classifies an address.
82
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
85
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
88
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
91
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
94
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
97
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
100
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
103
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
112 };
113
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
121 };
122
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
125 {
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
128
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
135
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
138
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
142
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
145
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
148
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
153 };
154
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
161 {}
162
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
173 {}
174
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
181 {}
182
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
185
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
188
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
193
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
210
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
213
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
216
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
219
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
222
223 /* Support for command line parsing of boolean flags in the tuning
224 structures. */
225 struct aarch64_flag_desc
226 {
227 const char* name;
228 unsigned int flag;
229 };
230
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
234 {
235 { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL },
238 { NULL, AARCH64_FUSE_NOTHING }
239 };
240
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
244 {
245 { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL },
248 { NULL, AARCH64_EXTRA_TUNE_NONE }
249 };
250
251 /* Tuning parameters. */
252
253 static const struct cpu_addrcost_table generic_addrcost_table =
254 {
255 {
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
260 },
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
267 };
268
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
270 {
271 {
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
276 },
277 0, /* pre_modify */
278 0, /* post_modify */
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
282 0, /* imm_offset */
283 };
284
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
286 {
287 {
288 1, /* hi */
289 0, /* si */
290 0, /* di */
291 1, /* ti */
292 },
293 1, /* pre_modify */
294 0, /* post_modify */
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
298 0, /* imm_offset */
299 };
300
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
302 {
303 {
304 1, /* hi */
305 1, /* si */
306 1, /* di */
307 2, /* ti */
308 },
309 0, /* pre_modify */
310 0, /* post_modify */
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
314 0, /* imm_offset */
315 };
316
317 static const struct cpu_regmove_cost generic_regmove_cost =
318 {
319 1, /* GP2GP */
320 /* Avoid the use of slow int<->fp moves for spilling by setting
321 their cost higher than memmov_cost. */
322 5, /* GP2FP */
323 5, /* FP2GP */
324 2 /* FP2FP */
325 };
326
327 static const struct cpu_regmove_cost cortexa57_regmove_cost =
328 {
329 1, /* GP2GP */
330 /* Avoid the use of slow int<->fp moves for spilling by setting
331 their cost higher than memmov_cost. */
332 5, /* GP2FP */
333 5, /* FP2GP */
334 2 /* FP2FP */
335 };
336
337 static const struct cpu_regmove_cost cortexa53_regmove_cost =
338 {
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
345 };
346
347 static const struct cpu_regmove_cost exynosm1_regmove_cost =
348 {
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost (actual, 4 and 9). */
352 9, /* GP2FP */
353 9, /* FP2GP */
354 1 /* FP2FP */
355 };
356
357 static const struct cpu_regmove_cost thunderx_regmove_cost =
358 {
359 2, /* GP2GP */
360 2, /* GP2FP */
361 6, /* FP2GP */
362 4 /* FP2FP */
363 };
364
365 static const struct cpu_regmove_cost xgene1_regmove_cost =
366 {
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 8, /* GP2FP */
371 8, /* FP2GP */
372 2 /* FP2FP */
373 };
374
375 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
376 {
377 2, /* GP2GP */
378 /* Avoid the use of int<->fp moves for spilling. */
379 6, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of int<->fp moves for spilling. */
388 8, /* GP2FP */
389 8, /* FP2GP */
390 4 /* FP2FP */
391 };
392
393 /* Generic costs for vector insn classes. */
394 static const struct cpu_vector_cost generic_vector_cost =
395 {
396 1, /* scalar_int_stmt_cost */
397 1, /* scalar_fp_stmt_cost */
398 1, /* scalar_load_cost */
399 1, /* scalar_store_cost */
400 1, /* vec_int_stmt_cost */
401 1, /* vec_fp_stmt_cost */
402 2, /* vec_permute_cost */
403 1, /* vec_to_scalar_cost */
404 1, /* scalar_to_vec_cost */
405 1, /* vec_align_load_cost */
406 1, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 3, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
411 };
412
413 /* ThunderX costs for vector insn classes. */
414 static const struct cpu_vector_cost thunderx_vector_cost =
415 {
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 3, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 4, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 4, /* vec_permute_cost */
423 2, /* vec_to_scalar_cost */
424 2, /* scalar_to_vec_cost */
425 3, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 5, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 3 /* cond_not_taken_branch_cost */
431 };
432
433 /* Generic costs for vector insn classes. */
434 static const struct cpu_vector_cost cortexa57_vector_cost =
435 {
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 4, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 2, /* vec_int_stmt_cost */
441 2, /* vec_fp_stmt_cost */
442 3, /* vec_permute_cost */
443 8, /* vec_to_scalar_cost */
444 8, /* scalar_to_vec_cost */
445 4, /* vec_align_load_cost */
446 4, /* vec_unalign_load_cost */
447 1, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 1, /* cond_taken_branch_cost */
450 1 /* cond_not_taken_branch_cost */
451 };
452
453 static const struct cpu_vector_cost exynosm1_vector_cost =
454 {
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 5, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 3, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 3, /* vec_to_scalar_cost */
463 3, /* scalar_to_vec_cost */
464 5, /* vec_align_load_cost */
465 5, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
470 };
471
472 /* Generic costs for vector insn classes. */
473 static const struct cpu_vector_cost xgene1_vector_cost =
474 {
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 2, /* vec_int_stmt_cost */
480 2, /* vec_fp_stmt_cost */
481 2, /* vec_permute_cost */
482 4, /* vec_to_scalar_cost */
483 4, /* scalar_to_vec_cost */
484 10, /* vec_align_load_cost */
485 10, /* vec_unalign_load_cost */
486 2, /* vec_unalign_store_cost */
487 2, /* vec_store_cost */
488 2, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
490 };
491
492 /* Costs for vector insn classes for Vulcan. */
493 static const struct cpu_vector_cost thunderx2t99_vector_cost =
494 {
495 1, /* scalar_int_stmt_cost */
496 6, /* scalar_fp_stmt_cost */
497 4, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 5, /* vec_int_stmt_cost */
500 6, /* vec_fp_stmt_cost */
501 3, /* vec_permute_cost */
502 6, /* vec_to_scalar_cost */
503 5, /* scalar_to_vec_cost */
504 8, /* vec_align_load_cost */
505 8, /* vec_unalign_load_cost */
506 4, /* vec_unalign_store_cost */
507 4, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
510 };
511
512 /* Generic costs for branch instructions. */
513 static const struct cpu_branch_cost generic_branch_cost =
514 {
515 1, /* Predictable. */
516 3 /* Unpredictable. */
517 };
518
519 /* Generic approximation modes. */
520 static const cpu_approx_modes generic_approx_modes =
521 {
522 AARCH64_APPROX_NONE, /* division */
523 AARCH64_APPROX_NONE, /* sqrt */
524 AARCH64_APPROX_NONE /* recip_sqrt */
525 };
526
527 /* Approximation modes for Exynos M1. */
528 static const cpu_approx_modes exynosm1_approx_modes =
529 {
530 AARCH64_APPROX_NONE, /* division */
531 AARCH64_APPROX_ALL, /* sqrt */
532 AARCH64_APPROX_ALL /* recip_sqrt */
533 };
534
535 /* Approximation modes for X-Gene 1. */
536 static const cpu_approx_modes xgene1_approx_modes =
537 {
538 AARCH64_APPROX_NONE, /* division */
539 AARCH64_APPROX_NONE, /* sqrt */
540 AARCH64_APPROX_ALL /* recip_sqrt */
541 };
542
543 /* Generic prefetch settings (which disable prefetch). */
544 static const cpu_prefetch_tune generic_prefetch_tune =
545 {
546 0, /* num_slots */
547 -1, /* l1_cache_size */
548 -1, /* l1_cache_line_size */
549 -1, /* l2_cache_size */
550 -1 /* default_opt_level */
551 };
552
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
554 {
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 -1 /* default_opt_level */
560 };
561
562 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
563 {
564 4, /* num_slots */
565 32, /* l1_cache_size */
566 64, /* l1_cache_line_size */
567 512, /* l2_cache_size */
568 -1 /* default_opt_level */
569 };
570
571 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
572 {
573 8, /* num_slots */
574 32, /* l1_cache_size */
575 128, /* l1_cache_line_size */
576 16*1024, /* l2_cache_size */
577 3 /* default_opt_level */
578 };
579
580 static const cpu_prefetch_tune thunderx_prefetch_tune =
581 {
582 8, /* num_slots */
583 32, /* l1_cache_size */
584 128, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
587 };
588
589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
590 {
591 8, /* num_slots */
592 32, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 256, /* l2_cache_size */
595 -1 /* default_opt_level */
596 };
597
598 static const struct tune_params generic_tunings =
599 {
600 &cortexa57_extra_costs,
601 &generic_addrcost_table,
602 &generic_regmove_cost,
603 &generic_vector_cost,
604 &generic_branch_cost,
605 &generic_approx_modes,
606 4, /* memmov_cost */
607 2, /* issue_rate */
608 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
609 8, /* function_align. */
610 4, /* jump_align. */
611 8, /* loop_align. */
612 2, /* int_reassoc_width. */
613 4, /* fp_reassoc_width. */
614 1, /* vec_reassoc_width. */
615 2, /* min_div_recip_mul_sf. */
616 2, /* min_div_recip_mul_df. */
617 0, /* max_case_values. */
618 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
619 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
620 &generic_prefetch_tune
621 };
622
623 static const struct tune_params cortexa35_tunings =
624 {
625 &cortexa53_extra_costs,
626 &generic_addrcost_table,
627 &cortexa53_regmove_cost,
628 &generic_vector_cost,
629 &generic_branch_cost,
630 &generic_approx_modes,
631 4, /* memmov_cost */
632 1, /* issue_rate */
633 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
634 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
635 16, /* function_align. */
636 4, /* jump_align. */
637 8, /* loop_align. */
638 2, /* int_reassoc_width. */
639 4, /* fp_reassoc_width. */
640 1, /* vec_reassoc_width. */
641 2, /* min_div_recip_mul_sf. */
642 2, /* min_div_recip_mul_df. */
643 0, /* max_case_values. */
644 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
645 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
646 &generic_prefetch_tune
647 };
648
649 static const struct tune_params cortexa53_tunings =
650 {
651 &cortexa53_extra_costs,
652 &generic_addrcost_table,
653 &cortexa53_regmove_cost,
654 &generic_vector_cost,
655 &generic_branch_cost,
656 &generic_approx_modes,
657 4, /* memmov_cost */
658 2, /* issue_rate */
659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
660 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
661 16, /* function_align. */
662 4, /* jump_align. */
663 8, /* loop_align. */
664 2, /* int_reassoc_width. */
665 4, /* fp_reassoc_width. */
666 1, /* vec_reassoc_width. */
667 2, /* min_div_recip_mul_sf. */
668 2, /* min_div_recip_mul_df. */
669 0, /* max_case_values. */
670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
671 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
672 &generic_prefetch_tune
673 };
674
675 static const struct tune_params cortexa57_tunings =
676 {
677 &cortexa57_extra_costs,
678 &generic_addrcost_table,
679 &cortexa57_regmove_cost,
680 &cortexa57_vector_cost,
681 &generic_branch_cost,
682 &generic_approx_modes,
683 4, /* memmov_cost */
684 3, /* issue_rate */
685 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
686 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
687 16, /* function_align. */
688 4, /* jump_align. */
689 8, /* loop_align. */
690 2, /* int_reassoc_width. */
691 4, /* fp_reassoc_width. */
692 1, /* vec_reassoc_width. */
693 2, /* min_div_recip_mul_sf. */
694 2, /* min_div_recip_mul_df. */
695 0, /* max_case_values. */
696 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
697 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
698 &generic_prefetch_tune
699 };
700
701 static const struct tune_params cortexa72_tunings =
702 {
703 &cortexa57_extra_costs,
704 &generic_addrcost_table,
705 &cortexa57_regmove_cost,
706 &cortexa57_vector_cost,
707 &generic_branch_cost,
708 &generic_approx_modes,
709 4, /* memmov_cost */
710 3, /* issue_rate */
711 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
712 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
713 16, /* function_align. */
714 4, /* jump_align. */
715 8, /* loop_align. */
716 2, /* int_reassoc_width. */
717 4, /* fp_reassoc_width. */
718 1, /* vec_reassoc_width. */
719 2, /* min_div_recip_mul_sf. */
720 2, /* min_div_recip_mul_df. */
721 0, /* max_case_values. */
722 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
723 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
724 &generic_prefetch_tune
725 };
726
727 static const struct tune_params cortexa73_tunings =
728 {
729 &cortexa57_extra_costs,
730 &generic_addrcost_table,
731 &cortexa57_regmove_cost,
732 &cortexa57_vector_cost,
733 &generic_branch_cost,
734 &generic_approx_modes,
735 4, /* memmov_cost. */
736 2, /* issue_rate. */
737 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
738 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
739 16, /* function_align. */
740 4, /* jump_align. */
741 8, /* loop_align. */
742 2, /* int_reassoc_width. */
743 4, /* fp_reassoc_width. */
744 1, /* vec_reassoc_width. */
745 2, /* min_div_recip_mul_sf. */
746 2, /* min_div_recip_mul_df. */
747 0, /* max_case_values. */
748 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
749 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
750 &generic_prefetch_tune
751 };
752
753
754
755 static const struct tune_params exynosm1_tunings =
756 {
757 &exynosm1_extra_costs,
758 &exynosm1_addrcost_table,
759 &exynosm1_regmove_cost,
760 &exynosm1_vector_cost,
761 &generic_branch_cost,
762 &exynosm1_approx_modes,
763 4, /* memmov_cost */
764 3, /* issue_rate */
765 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
766 4, /* function_align. */
767 4, /* jump_align. */
768 4, /* loop_align. */
769 2, /* int_reassoc_width. */
770 4, /* fp_reassoc_width. */
771 1, /* vec_reassoc_width. */
772 2, /* min_div_recip_mul_sf. */
773 2, /* min_div_recip_mul_df. */
774 48, /* max_case_values. */
775 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
776 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
777 &exynosm1_prefetch_tune
778 };
779
780 static const struct tune_params thunderxt88_tunings =
781 {
782 &thunderx_extra_costs,
783 &generic_addrcost_table,
784 &thunderx_regmove_cost,
785 &thunderx_vector_cost,
786 &generic_branch_cost,
787 &generic_approx_modes,
788 6, /* memmov_cost */
789 2, /* issue_rate */
790 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
791 8, /* function_align. */
792 8, /* jump_align. */
793 8, /* loop_align. */
794 2, /* int_reassoc_width. */
795 4, /* fp_reassoc_width. */
796 1, /* vec_reassoc_width. */
797 2, /* min_div_recip_mul_sf. */
798 2, /* min_div_recip_mul_df. */
799 0, /* max_case_values. */
800 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
801 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
802 &thunderxt88_prefetch_tune
803 };
804
805 static const struct tune_params thunderx_tunings =
806 {
807 &thunderx_extra_costs,
808 &generic_addrcost_table,
809 &thunderx_regmove_cost,
810 &thunderx_vector_cost,
811 &generic_branch_cost,
812 &generic_approx_modes,
813 6, /* memmov_cost */
814 2, /* issue_rate */
815 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
816 8, /* function_align. */
817 8, /* jump_align. */
818 8, /* loop_align. */
819 2, /* int_reassoc_width. */
820 4, /* fp_reassoc_width. */
821 1, /* vec_reassoc_width. */
822 2, /* min_div_recip_mul_sf. */
823 2, /* min_div_recip_mul_df. */
824 0, /* max_case_values. */
825 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
826 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
827 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
828 &thunderx_prefetch_tune
829 };
830
831 static const struct tune_params xgene1_tunings =
832 {
833 &xgene1_extra_costs,
834 &xgene1_addrcost_table,
835 &xgene1_regmove_cost,
836 &xgene1_vector_cost,
837 &generic_branch_cost,
838 &xgene1_approx_modes,
839 6, /* memmov_cost */
840 4, /* issue_rate */
841 AARCH64_FUSE_NOTHING, /* fusible_ops */
842 16, /* function_align. */
843 8, /* jump_align. */
844 16, /* loop_align. */
845 2, /* int_reassoc_width. */
846 4, /* fp_reassoc_width. */
847 1, /* vec_reassoc_width. */
848 2, /* min_div_recip_mul_sf. */
849 2, /* min_div_recip_mul_df. */
850 0, /* max_case_values. */
851 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
853 &generic_prefetch_tune
854 };
855
856 static const struct tune_params qdf24xx_tunings =
857 {
858 &qdf24xx_extra_costs,
859 &generic_addrcost_table,
860 &qdf24xx_regmove_cost,
861 &generic_vector_cost,
862 &generic_branch_cost,
863 &generic_approx_modes,
864 4, /* memmov_cost */
865 4, /* issue_rate */
866 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
867 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
868 16, /* function_align. */
869 8, /* jump_align. */
870 16, /* loop_align. */
871 2, /* int_reassoc_width. */
872 4, /* fp_reassoc_width. */
873 1, /* vec_reassoc_width. */
874 2, /* min_div_recip_mul_sf. */
875 2, /* min_div_recip_mul_df. */
876 0, /* max_case_values. */
877 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
878 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
879 &qdf24xx_prefetch_tune
880 };
881
882 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
883 for now. */
884 static const struct tune_params saphira_tunings =
885 {
886 &generic_extra_costs,
887 &generic_addrcost_table,
888 &generic_regmove_cost,
889 &generic_vector_cost,
890 &generic_branch_cost,
891 &generic_approx_modes,
892 4, /* memmov_cost */
893 4, /* issue_rate */
894 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
895 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
896 16, /* function_align. */
897 8, /* jump_align. */
898 16, /* loop_align. */
899 2, /* int_reassoc_width. */
900 4, /* fp_reassoc_width. */
901 1, /* vec_reassoc_width. */
902 2, /* min_div_recip_mul_sf. */
903 2, /* min_div_recip_mul_df. */
904 0, /* max_case_values. */
905 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
906 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
907 &generic_prefetch_tune
908 };
909
910 static const struct tune_params thunderx2t99_tunings =
911 {
912 &thunderx2t99_extra_costs,
913 &thunderx2t99_addrcost_table,
914 &thunderx2t99_regmove_cost,
915 &thunderx2t99_vector_cost,
916 &generic_branch_cost,
917 &generic_approx_modes,
918 4, /* memmov_cost. */
919 4, /* issue_rate. */
920 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
921 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
922 16, /* function_align. */
923 8, /* jump_align. */
924 16, /* loop_align. */
925 3, /* int_reassoc_width. */
926 2, /* fp_reassoc_width. */
927 2, /* vec_reassoc_width. */
928 2, /* min_div_recip_mul_sf. */
929 2, /* min_div_recip_mul_df. */
930 0, /* max_case_values. */
931 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
932 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
933 &thunderx2t99_prefetch_tune
934 };
935
936 /* Support for fine-grained override of the tuning structures. */
937 struct aarch64_tuning_override_function
938 {
939 const char* name;
940 void (*parse_override)(const char*, struct tune_params*);
941 };
942
943 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
944 static void aarch64_parse_tune_string (const char*, struct tune_params*);
945
946 static const struct aarch64_tuning_override_function
947 aarch64_tuning_override_functions[] =
948 {
949 { "fuse", aarch64_parse_fuse_string },
950 { "tune", aarch64_parse_tune_string },
951 { NULL, NULL }
952 };
953
954 /* A processor implementing AArch64. */
955 struct processor
956 {
957 const char *const name;
958 enum aarch64_processor ident;
959 enum aarch64_processor sched_core;
960 enum aarch64_arch arch;
961 unsigned architecture_version;
962 const unsigned long flags;
963 const struct tune_params *const tune;
964 };
965
966 /* Architectures implementing AArch64. */
967 static const struct processor all_architectures[] =
968 {
969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
970 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
971 #include "aarch64-arches.def"
972 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
973 };
974
975 /* Processor cores implementing AArch64. */
976 static const struct processor all_cores[] =
977 {
978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
979 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
980 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
981 FLAGS, &COSTS##_tunings},
982 #include "aarch64-cores.def"
983 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
984 AARCH64_FL_FOR_ARCH8, &generic_tunings},
985 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
986 };
987
988
989 /* Target specification. These are populated by the -march, -mtune, -mcpu
990 handling code or by target attributes. */
991 static const struct processor *selected_arch;
992 static const struct processor *selected_cpu;
993 static const struct processor *selected_tune;
994
995 /* The current tuning set. */
996 struct tune_params aarch64_tune_params = generic_tunings;
997
998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
999
1000 /* An ISA extension in the co-processor and main instruction set space. */
1001 struct aarch64_option_extension
1002 {
1003 const char *const name;
1004 const unsigned long flags_on;
1005 const unsigned long flags_off;
1006 };
1007
1008 typedef enum aarch64_cond_code
1009 {
1010 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1011 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1012 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1013 }
1014 aarch64_cc;
1015
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1017
1018 /* The condition codes of the processor, and the inverse function. */
1019 static const char * const aarch64_condition_codes[] =
1020 {
1021 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1023 };
1024
1025 /* Generate code to enable conditional branches in functions over 1 MiB. */
1026 const char *
1027 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1028 const char * branch_format)
1029 {
1030 rtx_code_label * tmp_label = gen_label_rtx ();
1031 char label_buf[256];
1032 char buffer[128];
1033 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1034 CODE_LABEL_NUMBER (tmp_label));
1035 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1036 rtx dest_label = operands[pos_label];
1037 operands[pos_label] = tmp_label;
1038
1039 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1040 output_asm_insn (buffer, operands);
1041
1042 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1043 operands[pos_label] = dest_label;
1044 output_asm_insn (buffer, operands);
1045 return "";
1046 }
1047
1048 void
1049 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1050 {
1051 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1052 if (TARGET_GENERAL_REGS_ONLY)
1053 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1054 else
1055 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1056 }
1057
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1061 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1063 irrespectively of its cost results in bad allocations with many redundant
1064 int<->FP moves which are expensive on various cores.
1065 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1067 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1068 Otherwise set the allocno class depending on the mode.
1069 The result of this is that it is no longer inefficient to have a higher
1070 memory move cost than the register move cost.
1071 */
1072
1073 static reg_class_t
1074 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1075 reg_class_t best_class)
1076 {
1077 machine_mode mode;
1078
1079 if (allocno_class != ALL_REGS)
1080 return allocno_class;
1081
1082 if (best_class != ALL_REGS)
1083 return best_class;
1084
1085 mode = PSEUDO_REGNO_MODE (regno);
1086 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1087 }
1088
1089 static unsigned int
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1091 {
1092 if (GET_MODE_UNIT_SIZE (mode) == 4)
1093 return aarch64_tune_params.min_div_recip_mul_sf;
1094 return aarch64_tune_params.min_div_recip_mul_df;
1095 }
1096
1097 /* Return the reassociation width of treeop OPC with mode MODE. */
1098 static int
1099 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1100 {
1101 if (VECTOR_MODE_P (mode))
1102 return aarch64_tune_params.vec_reassoc_width;
1103 if (INTEGRAL_MODE_P (mode))
1104 return aarch64_tune_params.int_reassoc_width;
1105 /* Avoid reassociating floating point addition so we emit more FMAs. */
1106 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1107 return aarch64_tune_params.fp_reassoc_width;
1108 return 1;
1109 }
1110
1111 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1112 unsigned
1113 aarch64_dbx_register_number (unsigned regno)
1114 {
1115 if (GP_REGNUM_P (regno))
1116 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1117 else if (regno == SP_REGNUM)
1118 return AARCH64_DWARF_SP;
1119 else if (FP_REGNUM_P (regno))
1120 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1121 else if (PR_REGNUM_P (regno))
1122 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1123 else if (regno == VG_REGNUM)
1124 return AARCH64_DWARF_VG;
1125
1126 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1127 equivalent DWARF register. */
1128 return DWARF_FRAME_REGISTERS;
1129 }
1130
1131 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1132 static bool
1133 aarch64_advsimd_struct_mode_p (machine_mode mode)
1134 {
1135 return (TARGET_SIMD
1136 && (mode == OImode || mode == CImode || mode == XImode));
1137 }
1138
1139 /* Return true if MODE is an SVE predicate mode. */
1140 static bool
1141 aarch64_sve_pred_mode_p (machine_mode mode)
1142 {
1143 return (TARGET_SVE
1144 && (mode == VNx16BImode
1145 || mode == VNx8BImode
1146 || mode == VNx4BImode
1147 || mode == VNx2BImode));
1148 }
1149
1150 /* Three mutually-exclusive flags describing a vector or predicate type. */
1151 const unsigned int VEC_ADVSIMD = 1;
1152 const unsigned int VEC_SVE_DATA = 2;
1153 const unsigned int VEC_SVE_PRED = 4;
1154 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1155 a structure of 2, 3 or 4 vectors. */
1156 const unsigned int VEC_STRUCT = 8;
1157 /* Useful combinations of the above. */
1158 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1159 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1160
1161 /* Return a set of flags describing the vector properties of mode MODE.
1162 Ignore modes that are not supported by the current target. */
1163 static unsigned int
1164 aarch64_classify_vector_mode (machine_mode mode)
1165 {
1166 if (aarch64_advsimd_struct_mode_p (mode))
1167 return VEC_ADVSIMD | VEC_STRUCT;
1168
1169 if (aarch64_sve_pred_mode_p (mode))
1170 return VEC_SVE_PRED;
1171
1172 scalar_mode inner = GET_MODE_INNER (mode);
1173 if (VECTOR_MODE_P (mode)
1174 && (inner == QImode
1175 || inner == HImode
1176 || inner == HFmode
1177 || inner == SImode
1178 || inner == SFmode
1179 || inner == DImode
1180 || inner == DFmode))
1181 {
1182 if (TARGET_SVE)
1183 {
1184 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1185 return VEC_SVE_DATA;
1186 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1187 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1188 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1189 return VEC_SVE_DATA | VEC_STRUCT;
1190 }
1191
1192 /* This includes V1DF but not V1DI (which doesn't exist). */
1193 if (TARGET_SIMD
1194 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1195 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1196 return VEC_ADVSIMD;
1197 }
1198
1199 return 0;
1200 }
1201
1202 /* Return true if MODE is any of the data vector modes, including
1203 structure modes. */
1204 static bool
1205 aarch64_vector_data_mode_p (machine_mode mode)
1206 {
1207 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1208 }
1209
1210 /* Return true if MODE is an SVE data vector mode; either a single vector
1211 or a structure of vectors. */
1212 static bool
1213 aarch64_sve_data_mode_p (machine_mode mode)
1214 {
1215 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1216 }
1217
1218 /* Implement target hook TARGET_ARRAY_MODE. */
1219 static opt_machine_mode
1220 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1221 {
1222 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1223 && IN_RANGE (nelems, 2, 4))
1224 return mode_for_vector (GET_MODE_INNER (mode),
1225 GET_MODE_NUNITS (mode) * nelems);
1226
1227 return opt_machine_mode ();
1228 }
1229
1230 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1231 static bool
1232 aarch64_array_mode_supported_p (machine_mode mode,
1233 unsigned HOST_WIDE_INT nelems)
1234 {
1235 if (TARGET_SIMD
1236 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1237 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1238 && (nelems >= 2 && nelems <= 4))
1239 return true;
1240
1241 return false;
1242 }
1243
1244 /* Return the SVE predicate mode to use for elements that have
1245 ELEM_NBYTES bytes, if such a mode exists. */
1246
1247 opt_machine_mode
1248 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1249 {
1250 if (TARGET_SVE)
1251 {
1252 if (elem_nbytes == 1)
1253 return VNx16BImode;
1254 if (elem_nbytes == 2)
1255 return VNx8BImode;
1256 if (elem_nbytes == 4)
1257 return VNx4BImode;
1258 if (elem_nbytes == 8)
1259 return VNx2BImode;
1260 }
1261 return opt_machine_mode ();
1262 }
1263
1264 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1265
1266 static opt_machine_mode
1267 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1268 {
1269 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1270 {
1271 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1272 machine_mode pred_mode;
1273 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1274 return pred_mode;
1275 }
1276
1277 return default_get_mask_mode (nunits, nbytes);
1278 }
1279
1280 /* Implement TARGET_HARD_REGNO_NREGS. */
1281
1282 static unsigned int
1283 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1284 {
1285 /* ??? Logically we should only need to provide a value when
1286 HARD_REGNO_MODE_OK says that the combination is valid,
1287 but at the moment we need to handle all modes. Just ignore
1288 any runtime parts for registers that can't store them. */
1289 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1290 switch (aarch64_regno_regclass (regno))
1291 {
1292 case FP_REGS:
1293 case FP_LO_REGS:
1294 if (aarch64_sve_data_mode_p (mode))
1295 return exact_div (GET_MODE_SIZE (mode),
1296 BYTES_PER_SVE_VECTOR).to_constant ();
1297 return CEIL (lowest_size, UNITS_PER_VREG);
1298 case PR_REGS:
1299 case PR_LO_REGS:
1300 case PR_HI_REGS:
1301 return 1;
1302 default:
1303 return CEIL (lowest_size, UNITS_PER_WORD);
1304 }
1305 gcc_unreachable ();
1306 }
1307
1308 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1309
1310 static bool
1311 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1312 {
1313 if (GET_MODE_CLASS (mode) == MODE_CC)
1314 return regno == CC_REGNUM;
1315
1316 if (regno == VG_REGNUM)
1317 /* This must have the same size as _Unwind_Word. */
1318 return mode == DImode;
1319
1320 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1321 if (vec_flags & VEC_SVE_PRED)
1322 return PR_REGNUM_P (regno);
1323
1324 if (PR_REGNUM_P (regno))
1325 return 0;
1326
1327 if (regno == SP_REGNUM)
1328 /* The purpose of comparing with ptr_mode is to support the
1329 global register variable associated with the stack pointer
1330 register via the syntax of asm ("wsp") in ILP32. */
1331 return mode == Pmode || mode == ptr_mode;
1332
1333 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1334 return mode == Pmode;
1335
1336 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1337 return true;
1338
1339 if (FP_REGNUM_P (regno))
1340 {
1341 if (vec_flags & VEC_STRUCT)
1342 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1343 else
1344 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1345 }
1346
1347 return false;
1348 }
1349
1350 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1351 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1352 clobbers the top 64 bits when restoring the bottom 64 bits. */
1353
1354 static bool
1355 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1356 {
1357 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1358 }
1359
1360 /* Implement REGMODE_NATURAL_SIZE. */
1361 poly_uint64
1362 aarch64_regmode_natural_size (machine_mode mode)
1363 {
1364 /* The natural size for SVE data modes is one SVE data vector,
1365 and similarly for predicates. We can't independently modify
1366 anything smaller than that. */
1367 /* ??? For now, only do this for variable-width SVE registers.
1368 Doing it for constant-sized registers breaks lower-subreg.c. */
1369 /* ??? And once that's fixed, we should probably have similar
1370 code for Advanced SIMD. */
1371 if (!aarch64_sve_vg.is_constant ())
1372 {
1373 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1374 if (vec_flags & VEC_SVE_PRED)
1375 return BYTES_PER_SVE_PRED;
1376 if (vec_flags & VEC_SVE_DATA)
1377 return BYTES_PER_SVE_VECTOR;
1378 }
1379 return UNITS_PER_WORD;
1380 }
1381
1382 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1383 machine_mode
1384 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1385 machine_mode mode)
1386 {
1387 /* The predicate mode determines which bits are significant and
1388 which are "don't care". Decreasing the number of lanes would
1389 lose data while increasing the number of lanes would make bits
1390 unnecessarily significant. */
1391 if (PR_REGNUM_P (regno))
1392 return mode;
1393 if (known_ge (GET_MODE_SIZE (mode), 4))
1394 return mode;
1395 else
1396 return SImode;
1397 }
1398
1399 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1400 that strcpy from constants will be faster. */
1401
1402 static HOST_WIDE_INT
1403 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1404 {
1405 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1406 return MAX (align, BITS_PER_WORD);
1407 return align;
1408 }
1409
1410 /* Return true if calls to DECL should be treated as
1411 long-calls (ie called via a register). */
1412 static bool
1413 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1414 {
1415 return false;
1416 }
1417
1418 /* Return true if calls to symbol-ref SYM should be treated as
1419 long-calls (ie called via a register). */
1420 bool
1421 aarch64_is_long_call_p (rtx sym)
1422 {
1423 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1424 }
1425
1426 /* Return true if calls to symbol-ref SYM should not go through
1427 plt stubs. */
1428
1429 bool
1430 aarch64_is_noplt_call_p (rtx sym)
1431 {
1432 const_tree decl = SYMBOL_REF_DECL (sym);
1433
1434 if (flag_pic
1435 && decl
1436 && (!flag_plt
1437 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1438 && !targetm.binds_local_p (decl))
1439 return true;
1440
1441 return false;
1442 }
1443
1444 /* Return true if the offsets to a zero/sign-extract operation
1445 represent an expression that matches an extend operation. The
1446 operands represent the paramters from
1447
1448 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1449 bool
1450 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1451 rtx extract_imm)
1452 {
1453 HOST_WIDE_INT mult_val, extract_val;
1454
1455 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1456 return false;
1457
1458 mult_val = INTVAL (mult_imm);
1459 extract_val = INTVAL (extract_imm);
1460
1461 if (extract_val > 8
1462 && extract_val < GET_MODE_BITSIZE (mode)
1463 && exact_log2 (extract_val & ~7) > 0
1464 && (extract_val & 7) <= 4
1465 && mult_val == (1 << (extract_val & 7)))
1466 return true;
1467
1468 return false;
1469 }
1470
1471 /* Emit an insn that's a simple single-set. Both the operands must be
1472 known to be valid. */
1473 inline static rtx_insn *
1474 emit_set_insn (rtx x, rtx y)
1475 {
1476 return emit_insn (gen_rtx_SET (x, y));
1477 }
1478
1479 /* X and Y are two things to compare using CODE. Emit the compare insn and
1480 return the rtx for register 0 in the proper mode. */
1481 rtx
1482 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1483 {
1484 machine_mode mode = SELECT_CC_MODE (code, x, y);
1485 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1486
1487 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1488 return cc_reg;
1489 }
1490
1491 /* Build the SYMBOL_REF for __tls_get_addr. */
1492
1493 static GTY(()) rtx tls_get_addr_libfunc;
1494
1495 rtx
1496 aarch64_tls_get_addr (void)
1497 {
1498 if (!tls_get_addr_libfunc)
1499 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1500 return tls_get_addr_libfunc;
1501 }
1502
1503 /* Return the TLS model to use for ADDR. */
1504
1505 static enum tls_model
1506 tls_symbolic_operand_type (rtx addr)
1507 {
1508 enum tls_model tls_kind = TLS_MODEL_NONE;
1509 if (GET_CODE (addr) == CONST)
1510 {
1511 poly_int64 addend;
1512 rtx sym = strip_offset (addr, &addend);
1513 if (GET_CODE (sym) == SYMBOL_REF)
1514 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1515 }
1516 else if (GET_CODE (addr) == SYMBOL_REF)
1517 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1518
1519 return tls_kind;
1520 }
1521
1522 /* We'll allow lo_sum's in addresses in our legitimate addresses
1523 so that combine would take care of combining addresses where
1524 necessary, but for generation purposes, we'll generate the address
1525 as :
1526 RTL Absolute
1527 tmp = hi (symbol_ref); adrp x1, foo
1528 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1529 nop
1530
1531 PIC TLS
1532 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1533 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1534 bl __tls_get_addr
1535 nop
1536
1537 Load TLS symbol, depending on TLS mechanism and TLS access model.
1538
1539 Global Dynamic - Traditional TLS:
1540 adrp tmp, :tlsgd:imm
1541 add dest, tmp, #:tlsgd_lo12:imm
1542 bl __tls_get_addr
1543
1544 Global Dynamic - TLS Descriptors:
1545 adrp dest, :tlsdesc:imm
1546 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1547 add dest, dest, #:tlsdesc_lo12:imm
1548 blr tmp
1549 mrs tp, tpidr_el0
1550 add dest, dest, tp
1551
1552 Initial Exec:
1553 mrs tp, tpidr_el0
1554 adrp tmp, :gottprel:imm
1555 ldr dest, [tmp, #:gottprel_lo12:imm]
1556 add dest, dest, tp
1557
1558 Local Exec:
1559 mrs tp, tpidr_el0
1560 add t0, tp, #:tprel_hi12:imm, lsl #12
1561 add t0, t0, #:tprel_lo12_nc:imm
1562 */
1563
1564 static void
1565 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1566 enum aarch64_symbol_type type)
1567 {
1568 switch (type)
1569 {
1570 case SYMBOL_SMALL_ABSOLUTE:
1571 {
1572 /* In ILP32, the mode of dest can be either SImode or DImode. */
1573 rtx tmp_reg = dest;
1574 machine_mode mode = GET_MODE (dest);
1575
1576 gcc_assert (mode == Pmode || mode == ptr_mode);
1577
1578 if (can_create_pseudo_p ())
1579 tmp_reg = gen_reg_rtx (mode);
1580
1581 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1582 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1583 return;
1584 }
1585
1586 case SYMBOL_TINY_ABSOLUTE:
1587 emit_insn (gen_rtx_SET (dest, imm));
1588 return;
1589
1590 case SYMBOL_SMALL_GOT_28K:
1591 {
1592 machine_mode mode = GET_MODE (dest);
1593 rtx gp_rtx = pic_offset_table_rtx;
1594 rtx insn;
1595 rtx mem;
1596
1597 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1598 here before rtl expand. Tree IVOPT will generate rtl pattern to
1599 decide rtx costs, in which case pic_offset_table_rtx is not
1600 initialized. For that case no need to generate the first adrp
1601 instruction as the final cost for global variable access is
1602 one instruction. */
1603 if (gp_rtx != NULL)
1604 {
1605 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1606 using the page base as GOT base, the first page may be wasted,
1607 in the worst scenario, there is only 28K space for GOT).
1608
1609 The generate instruction sequence for accessing global variable
1610 is:
1611
1612 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1613
1614 Only one instruction needed. But we must initialize
1615 pic_offset_table_rtx properly. We generate initialize insn for
1616 every global access, and allow CSE to remove all redundant.
1617
1618 The final instruction sequences will look like the following
1619 for multiply global variables access.
1620
1621 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1622
1623 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1624 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1625 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1626 ... */
1627
1628 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1629 crtl->uses_pic_offset_table = 1;
1630 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1631
1632 if (mode != GET_MODE (gp_rtx))
1633 gp_rtx = gen_lowpart (mode, gp_rtx);
1634
1635 }
1636
1637 if (mode == ptr_mode)
1638 {
1639 if (mode == DImode)
1640 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1641 else
1642 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1643
1644 mem = XVECEXP (SET_SRC (insn), 0, 0);
1645 }
1646 else
1647 {
1648 gcc_assert (mode == Pmode);
1649
1650 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1651 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1652 }
1653
1654 /* The operand is expected to be MEM. Whenever the related insn
1655 pattern changed, above code which calculate mem should be
1656 updated. */
1657 gcc_assert (GET_CODE (mem) == MEM);
1658 MEM_READONLY_P (mem) = 1;
1659 MEM_NOTRAP_P (mem) = 1;
1660 emit_insn (insn);
1661 return;
1662 }
1663
1664 case SYMBOL_SMALL_GOT_4G:
1665 {
1666 /* In ILP32, the mode of dest can be either SImode or DImode,
1667 while the got entry is always of SImode size. The mode of
1668 dest depends on how dest is used: if dest is assigned to a
1669 pointer (e.g. in the memory), it has SImode; it may have
1670 DImode if dest is dereferenced to access the memeory.
1671 This is why we have to handle three different ldr_got_small
1672 patterns here (two patterns for ILP32). */
1673
1674 rtx insn;
1675 rtx mem;
1676 rtx tmp_reg = dest;
1677 machine_mode mode = GET_MODE (dest);
1678
1679 if (can_create_pseudo_p ())
1680 tmp_reg = gen_reg_rtx (mode);
1681
1682 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1683 if (mode == ptr_mode)
1684 {
1685 if (mode == DImode)
1686 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1687 else
1688 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1689
1690 mem = XVECEXP (SET_SRC (insn), 0, 0);
1691 }
1692 else
1693 {
1694 gcc_assert (mode == Pmode);
1695
1696 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1697 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1698 }
1699
1700 gcc_assert (GET_CODE (mem) == MEM);
1701 MEM_READONLY_P (mem) = 1;
1702 MEM_NOTRAP_P (mem) = 1;
1703 emit_insn (insn);
1704 return;
1705 }
1706
1707 case SYMBOL_SMALL_TLSGD:
1708 {
1709 rtx_insn *insns;
1710 machine_mode mode = GET_MODE (dest);
1711 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1712
1713 start_sequence ();
1714 if (TARGET_ILP32)
1715 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1716 else
1717 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1718 insns = get_insns ();
1719 end_sequence ();
1720
1721 RTL_CONST_CALL_P (insns) = 1;
1722 emit_libcall_block (insns, dest, result, imm);
1723 return;
1724 }
1725
1726 case SYMBOL_SMALL_TLSDESC:
1727 {
1728 machine_mode mode = GET_MODE (dest);
1729 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1730 rtx tp;
1731
1732 gcc_assert (mode == Pmode || mode == ptr_mode);
1733
1734 /* In ILP32, the got entry is always of SImode size. Unlike
1735 small GOT, the dest is fixed at reg 0. */
1736 if (TARGET_ILP32)
1737 emit_insn (gen_tlsdesc_small_si (imm));
1738 else
1739 emit_insn (gen_tlsdesc_small_di (imm));
1740 tp = aarch64_load_tp (NULL);
1741
1742 if (mode != Pmode)
1743 tp = gen_lowpart (mode, tp);
1744
1745 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1746 if (REG_P (dest))
1747 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1748 return;
1749 }
1750
1751 case SYMBOL_SMALL_TLSIE:
1752 {
1753 /* In ILP32, the mode of dest can be either SImode or DImode,
1754 while the got entry is always of SImode size. The mode of
1755 dest depends on how dest is used: if dest is assigned to a
1756 pointer (e.g. in the memory), it has SImode; it may have
1757 DImode if dest is dereferenced to access the memeory.
1758 This is why we have to handle three different tlsie_small
1759 patterns here (two patterns for ILP32). */
1760 machine_mode mode = GET_MODE (dest);
1761 rtx tmp_reg = gen_reg_rtx (mode);
1762 rtx tp = aarch64_load_tp (NULL);
1763
1764 if (mode == ptr_mode)
1765 {
1766 if (mode == DImode)
1767 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1768 else
1769 {
1770 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1771 tp = gen_lowpart (mode, tp);
1772 }
1773 }
1774 else
1775 {
1776 gcc_assert (mode == Pmode);
1777 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1778 }
1779
1780 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1781 if (REG_P (dest))
1782 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1783 return;
1784 }
1785
1786 case SYMBOL_TLSLE12:
1787 case SYMBOL_TLSLE24:
1788 case SYMBOL_TLSLE32:
1789 case SYMBOL_TLSLE48:
1790 {
1791 machine_mode mode = GET_MODE (dest);
1792 rtx tp = aarch64_load_tp (NULL);
1793
1794 if (mode != Pmode)
1795 tp = gen_lowpart (mode, tp);
1796
1797 switch (type)
1798 {
1799 case SYMBOL_TLSLE12:
1800 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1801 (dest, tp, imm));
1802 break;
1803 case SYMBOL_TLSLE24:
1804 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1805 (dest, tp, imm));
1806 break;
1807 case SYMBOL_TLSLE32:
1808 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1809 (dest, imm));
1810 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1811 (dest, dest, tp));
1812 break;
1813 case SYMBOL_TLSLE48:
1814 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1815 (dest, imm));
1816 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1817 (dest, dest, tp));
1818 break;
1819 default:
1820 gcc_unreachable ();
1821 }
1822
1823 if (REG_P (dest))
1824 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1825 return;
1826 }
1827
1828 case SYMBOL_TINY_GOT:
1829 emit_insn (gen_ldr_got_tiny (dest, imm));
1830 return;
1831
1832 case SYMBOL_TINY_TLSIE:
1833 {
1834 machine_mode mode = GET_MODE (dest);
1835 rtx tp = aarch64_load_tp (NULL);
1836
1837 if (mode == ptr_mode)
1838 {
1839 if (mode == DImode)
1840 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1841 else
1842 {
1843 tp = gen_lowpart (mode, tp);
1844 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1845 }
1846 }
1847 else
1848 {
1849 gcc_assert (mode == Pmode);
1850 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1851 }
1852
1853 if (REG_P (dest))
1854 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1855 return;
1856 }
1857
1858 default:
1859 gcc_unreachable ();
1860 }
1861 }
1862
1863 /* Emit a move from SRC to DEST. Assume that the move expanders can
1864 handle all moves if !can_create_pseudo_p (). The distinction is
1865 important because, unlike emit_move_insn, the move expanders know
1866 how to force Pmode objects into the constant pool even when the
1867 constant pool address is not itself legitimate. */
1868 static rtx
1869 aarch64_emit_move (rtx dest, rtx src)
1870 {
1871 return (can_create_pseudo_p ()
1872 ? emit_move_insn (dest, src)
1873 : emit_move_insn_1 (dest, src));
1874 }
1875
1876 /* Apply UNOPTAB to OP and store the result in DEST. */
1877
1878 static void
1879 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1880 {
1881 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1882 if (dest != tmp)
1883 emit_move_insn (dest, tmp);
1884 }
1885
1886 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1887
1888 static void
1889 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1890 {
1891 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1892 OPTAB_DIRECT);
1893 if (dest != tmp)
1894 emit_move_insn (dest, tmp);
1895 }
1896
1897 /* Split a 128-bit move operation into two 64-bit move operations,
1898 taking care to handle partial overlap of register to register
1899 copies. Special cases are needed when moving between GP regs and
1900 FP regs. SRC can be a register, constant or memory; DST a register
1901 or memory. If either operand is memory it must not have any side
1902 effects. */
1903 void
1904 aarch64_split_128bit_move (rtx dst, rtx src)
1905 {
1906 rtx dst_lo, dst_hi;
1907 rtx src_lo, src_hi;
1908
1909 machine_mode mode = GET_MODE (dst);
1910
1911 gcc_assert (mode == TImode || mode == TFmode);
1912 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1913 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1914
1915 if (REG_P (dst) && REG_P (src))
1916 {
1917 int src_regno = REGNO (src);
1918 int dst_regno = REGNO (dst);
1919
1920 /* Handle FP <-> GP regs. */
1921 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1922 {
1923 src_lo = gen_lowpart (word_mode, src);
1924 src_hi = gen_highpart (word_mode, src);
1925
1926 if (mode == TImode)
1927 {
1928 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1929 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1930 }
1931 else
1932 {
1933 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1934 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1935 }
1936 return;
1937 }
1938 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1939 {
1940 dst_lo = gen_lowpart (word_mode, dst);
1941 dst_hi = gen_highpart (word_mode, dst);
1942
1943 if (mode == TImode)
1944 {
1945 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1946 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1947 }
1948 else
1949 {
1950 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1951 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1952 }
1953 return;
1954 }
1955 }
1956
1957 dst_lo = gen_lowpart (word_mode, dst);
1958 dst_hi = gen_highpart (word_mode, dst);
1959 src_lo = gen_lowpart (word_mode, src);
1960 src_hi = gen_highpart_mode (word_mode, mode, src);
1961
1962 /* At most one pairing may overlap. */
1963 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1964 {
1965 aarch64_emit_move (dst_hi, src_hi);
1966 aarch64_emit_move (dst_lo, src_lo);
1967 }
1968 else
1969 {
1970 aarch64_emit_move (dst_lo, src_lo);
1971 aarch64_emit_move (dst_hi, src_hi);
1972 }
1973 }
1974
1975 bool
1976 aarch64_split_128bit_move_p (rtx dst, rtx src)
1977 {
1978 return (! REG_P (src)
1979 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1980 }
1981
1982 /* Split a complex SIMD combine. */
1983
1984 void
1985 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1986 {
1987 machine_mode src_mode = GET_MODE (src1);
1988 machine_mode dst_mode = GET_MODE (dst);
1989
1990 gcc_assert (VECTOR_MODE_P (dst_mode));
1991 gcc_assert (register_operand (dst, dst_mode)
1992 && register_operand (src1, src_mode)
1993 && register_operand (src2, src_mode));
1994
1995 rtx (*gen) (rtx, rtx, rtx);
1996
1997 switch (src_mode)
1998 {
1999 case E_V8QImode:
2000 gen = gen_aarch64_simd_combinev8qi;
2001 break;
2002 case E_V4HImode:
2003 gen = gen_aarch64_simd_combinev4hi;
2004 break;
2005 case E_V2SImode:
2006 gen = gen_aarch64_simd_combinev2si;
2007 break;
2008 case E_V4HFmode:
2009 gen = gen_aarch64_simd_combinev4hf;
2010 break;
2011 case E_V2SFmode:
2012 gen = gen_aarch64_simd_combinev2sf;
2013 break;
2014 case E_DImode:
2015 gen = gen_aarch64_simd_combinedi;
2016 break;
2017 case E_DFmode:
2018 gen = gen_aarch64_simd_combinedf;
2019 break;
2020 default:
2021 gcc_unreachable ();
2022 }
2023
2024 emit_insn (gen (dst, src1, src2));
2025 return;
2026 }
2027
2028 /* Split a complex SIMD move. */
2029
2030 void
2031 aarch64_split_simd_move (rtx dst, rtx src)
2032 {
2033 machine_mode src_mode = GET_MODE (src);
2034 machine_mode dst_mode = GET_MODE (dst);
2035
2036 gcc_assert (VECTOR_MODE_P (dst_mode));
2037
2038 if (REG_P (dst) && REG_P (src))
2039 {
2040 rtx (*gen) (rtx, rtx);
2041
2042 gcc_assert (VECTOR_MODE_P (src_mode));
2043
2044 switch (src_mode)
2045 {
2046 case E_V16QImode:
2047 gen = gen_aarch64_split_simd_movv16qi;
2048 break;
2049 case E_V8HImode:
2050 gen = gen_aarch64_split_simd_movv8hi;
2051 break;
2052 case E_V4SImode:
2053 gen = gen_aarch64_split_simd_movv4si;
2054 break;
2055 case E_V2DImode:
2056 gen = gen_aarch64_split_simd_movv2di;
2057 break;
2058 case E_V8HFmode:
2059 gen = gen_aarch64_split_simd_movv8hf;
2060 break;
2061 case E_V4SFmode:
2062 gen = gen_aarch64_split_simd_movv4sf;
2063 break;
2064 case E_V2DFmode:
2065 gen = gen_aarch64_split_simd_movv2df;
2066 break;
2067 default:
2068 gcc_unreachable ();
2069 }
2070
2071 emit_insn (gen (dst, src));
2072 return;
2073 }
2074 }
2075
2076 bool
2077 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2078 machine_mode ymode, rtx y)
2079 {
2080 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2081 gcc_assert (r != NULL);
2082 return rtx_equal_p (x, r);
2083 }
2084
2085
2086 static rtx
2087 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2088 {
2089 if (can_create_pseudo_p ())
2090 return force_reg (mode, value);
2091 else
2092 {
2093 gcc_assert (x);
2094 aarch64_emit_move (x, value);
2095 return x;
2096 }
2097 }
2098
2099 /* Return true if we can move VALUE into a register using a single
2100 CNT[BHWD] instruction. */
2101
2102 static bool
2103 aarch64_sve_cnt_immediate_p (poly_int64 value)
2104 {
2105 HOST_WIDE_INT factor = value.coeffs[0];
2106 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2107 return (value.coeffs[1] == factor
2108 && IN_RANGE (factor, 2, 16 * 16)
2109 && (factor & 1) == 0
2110 && factor <= 16 * (factor & -factor));
2111 }
2112
2113 /* Likewise for rtx X. */
2114
2115 bool
2116 aarch64_sve_cnt_immediate_p (rtx x)
2117 {
2118 poly_int64 value;
2119 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2120 }
2121
2122 /* Return the asm string for an instruction with a CNT-like vector size
2123 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2124 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2125 first part of the operands template (the part that comes before the
2126 vector size itself). FACTOR is the number of quadwords.
2127 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2128 If it is zero, we can use any element size. */
2129
2130 static char *
2131 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2132 unsigned int factor,
2133 unsigned int nelts_per_vq)
2134 {
2135 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2136
2137 if (nelts_per_vq == 0)
2138 /* There is some overlap in the ranges of the four CNT instructions.
2139 Here we always use the smallest possible element size, so that the
2140 multiplier is 1 whereever possible. */
2141 nelts_per_vq = factor & -factor;
2142 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2143 gcc_assert (IN_RANGE (shift, 1, 4));
2144 char suffix = "dwhb"[shift - 1];
2145
2146 factor >>= shift;
2147 unsigned int written;
2148 if (factor == 1)
2149 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2150 prefix, suffix, operands);
2151 else
2152 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2153 prefix, suffix, operands, factor);
2154 gcc_assert (written < sizeof (buffer));
2155 return buffer;
2156 }
2157
2158 /* Return the asm string for an instruction with a CNT-like vector size
2159 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2160 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2161 first part of the operands template (the part that comes before the
2162 vector size itself). X is the value of the vector size operand,
2163 as a polynomial integer rtx. */
2164
2165 char *
2166 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2167 rtx x)
2168 {
2169 poly_int64 value = rtx_to_poly_int64 (x);
2170 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2171 return aarch64_output_sve_cnt_immediate (prefix, operands,
2172 value.coeffs[1], 0);
2173 }
2174
2175 /* Return true if we can add VALUE to a register using a single ADDVL
2176 or ADDPL instruction. */
2177
2178 static bool
2179 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2180 {
2181 HOST_WIDE_INT factor = value.coeffs[0];
2182 if (factor == 0 || value.coeffs[1] != factor)
2183 return false;
2184 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2185 and a value of 16 is one vector width. */
2186 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2187 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2188 }
2189
2190 /* Likewise for rtx X. */
2191
2192 bool
2193 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2194 {
2195 poly_int64 value;
2196 return (poly_int_rtx_p (x, &value)
2197 && aarch64_sve_addvl_addpl_immediate_p (value));
2198 }
2199
2200 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2201 and storing the result in operand 0. */
2202
2203 char *
2204 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2205 {
2206 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2207 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2208 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2209
2210 /* Use INC or DEC if possible. */
2211 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2212 {
2213 if (aarch64_sve_cnt_immediate_p (offset_value))
2214 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2215 offset_value.coeffs[1], 0);
2216 if (aarch64_sve_cnt_immediate_p (-offset_value))
2217 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2218 -offset_value.coeffs[1], 0);
2219 }
2220
2221 int factor = offset_value.coeffs[1];
2222 if ((factor & 15) == 0)
2223 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2224 else
2225 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2226 return buffer;
2227 }
2228
2229 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2230 instruction. If it is, store the number of elements in each vector
2231 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2232 factor in *FACTOR_OUT (if nonnull). */
2233
2234 bool
2235 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2236 unsigned int *nelts_per_vq_out)
2237 {
2238 rtx elt;
2239 poly_int64 value;
2240
2241 if (!const_vec_duplicate_p (x, &elt)
2242 || !poly_int_rtx_p (elt, &value))
2243 return false;
2244
2245 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2246 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2247 /* There's no vector INCB. */
2248 return false;
2249
2250 HOST_WIDE_INT factor = value.coeffs[0];
2251 if (value.coeffs[1] != factor)
2252 return false;
2253
2254 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2255 if ((factor % nelts_per_vq) != 0
2256 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2257 return false;
2258
2259 if (factor_out)
2260 *factor_out = factor;
2261 if (nelts_per_vq_out)
2262 *nelts_per_vq_out = nelts_per_vq;
2263 return true;
2264 }
2265
2266 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2267 instruction. */
2268
2269 bool
2270 aarch64_sve_inc_dec_immediate_p (rtx x)
2271 {
2272 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2273 }
2274
2275 /* Return the asm template for an SVE vector INC or DEC instruction.
2276 OPERANDS gives the operands before the vector count and X is the
2277 value of the vector count operand itself. */
2278
2279 char *
2280 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2281 {
2282 int factor;
2283 unsigned int nelts_per_vq;
2284 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2285 gcc_unreachable ();
2286 if (factor < 0)
2287 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2288 nelts_per_vq);
2289 else
2290 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2291 nelts_per_vq);
2292 }
2293
2294 static int
2295 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2296 scalar_int_mode mode)
2297 {
2298 int i;
2299 unsigned HOST_WIDE_INT val, val2, mask;
2300 int one_match, zero_match;
2301 int num_insns;
2302
2303 val = INTVAL (imm);
2304
2305 if (aarch64_move_imm (val, mode))
2306 {
2307 if (generate)
2308 emit_insn (gen_rtx_SET (dest, imm));
2309 return 1;
2310 }
2311
2312 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2313 (with XXXX non-zero). In that case check to see if the move can be done in
2314 a smaller mode. */
2315 val2 = val & 0xffffffff;
2316 if (mode == DImode
2317 && aarch64_move_imm (val2, SImode)
2318 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2319 {
2320 if (generate)
2321 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2322
2323 /* Check if we have to emit a second instruction by checking to see
2324 if any of the upper 32 bits of the original DI mode value is set. */
2325 if (val == val2)
2326 return 1;
2327
2328 i = (val >> 48) ? 48 : 32;
2329
2330 if (generate)
2331 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2332 GEN_INT ((val >> i) & 0xffff)));
2333
2334 return 2;
2335 }
2336
2337 if ((val >> 32) == 0 || mode == SImode)
2338 {
2339 if (generate)
2340 {
2341 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2342 if (mode == SImode)
2343 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2344 GEN_INT ((val >> 16) & 0xffff)));
2345 else
2346 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2347 GEN_INT ((val >> 16) & 0xffff)));
2348 }
2349 return 2;
2350 }
2351
2352 /* Remaining cases are all for DImode. */
2353
2354 mask = 0xffff;
2355 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2356 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2357 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2358 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2359
2360 if (zero_match != 2 && one_match != 2)
2361 {
2362 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2363 For a 64-bit bitmask try whether changing 16 bits to all ones or
2364 zeroes creates a valid bitmask. To check any repeated bitmask,
2365 try using 16 bits from the other 32-bit half of val. */
2366
2367 for (i = 0; i < 64; i += 16, mask <<= 16)
2368 {
2369 val2 = val & ~mask;
2370 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2371 break;
2372 val2 = val | mask;
2373 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2374 break;
2375 val2 = val2 & ~mask;
2376 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2377 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2378 break;
2379 }
2380 if (i != 64)
2381 {
2382 if (generate)
2383 {
2384 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2385 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2386 GEN_INT ((val >> i) & 0xffff)));
2387 }
2388 return 2;
2389 }
2390 }
2391
2392 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2393 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2394 otherwise skip zero bits. */
2395
2396 num_insns = 1;
2397 mask = 0xffff;
2398 val2 = one_match > zero_match ? ~val : val;
2399 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2400
2401 if (generate)
2402 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2403 ? (val | ~(mask << i))
2404 : (val & (mask << i)))));
2405 for (i += 16; i < 64; i += 16)
2406 {
2407 if ((val2 & (mask << i)) == 0)
2408 continue;
2409 if (generate)
2410 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2411 GEN_INT ((val >> i) & 0xffff)));
2412 num_insns ++;
2413 }
2414
2415 return num_insns;
2416 }
2417
2418 /* Return whether imm is a 128-bit immediate which is simple enough to
2419 expand inline. */
2420 bool
2421 aarch64_mov128_immediate (rtx imm)
2422 {
2423 if (GET_CODE (imm) == CONST_INT)
2424 return true;
2425
2426 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2427
2428 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2429 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2430
2431 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2432 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2433 }
2434
2435
2436 /* Return the number of temporary registers that aarch64_add_offset_1
2437 would need to add OFFSET to a register. */
2438
2439 static unsigned int
2440 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2441 {
2442 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2443 }
2444
2445 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2446 a non-polynomial OFFSET. MODE is the mode of the addition.
2447 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2448 be set and CFA adjustments added to the generated instructions.
2449
2450 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2451 temporary if register allocation is already complete. This temporary
2452 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2453 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2454 the immediate again.
2455
2456 Since this function may be used to adjust the stack pointer, we must
2457 ensure that it cannot cause transient stack deallocation (for example
2458 by first incrementing SP and then decrementing when adjusting by a
2459 large immediate). */
2460
2461 static void
2462 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2463 rtx src, HOST_WIDE_INT offset, rtx temp1,
2464 bool frame_related_p, bool emit_move_imm)
2465 {
2466 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2467 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2468
2469 HOST_WIDE_INT moffset = abs_hwi (offset);
2470 rtx_insn *insn;
2471
2472 if (!moffset)
2473 {
2474 if (!rtx_equal_p (dest, src))
2475 {
2476 insn = emit_insn (gen_rtx_SET (dest, src));
2477 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2478 }
2479 return;
2480 }
2481
2482 /* Single instruction adjustment. */
2483 if (aarch64_uimm12_shift (moffset))
2484 {
2485 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2486 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2487 return;
2488 }
2489
2490 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2491 and either:
2492
2493 a) the offset cannot be loaded by a 16-bit move or
2494 b) there is no spare register into which we can move it. */
2495 if (moffset < 0x1000000
2496 && ((!temp1 && !can_create_pseudo_p ())
2497 || !aarch64_move_imm (moffset, mode)))
2498 {
2499 HOST_WIDE_INT low_off = moffset & 0xfff;
2500
2501 low_off = offset < 0 ? -low_off : low_off;
2502 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2503 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2504 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2505 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2506 return;
2507 }
2508
2509 /* Emit a move immediate if required and an addition/subtraction. */
2510 if (emit_move_imm)
2511 {
2512 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2513 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2514 }
2515 insn = emit_insn (offset < 0
2516 ? gen_sub3_insn (dest, src, temp1)
2517 : gen_add3_insn (dest, src, temp1));
2518 if (frame_related_p)
2519 {
2520 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2521 rtx adj = plus_constant (mode, src, offset);
2522 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2523 }
2524 }
2525
2526 /* Return the number of temporary registers that aarch64_add_offset
2527 would need to move OFFSET into a register or add OFFSET to a register;
2528 ADD_P is true if we want the latter rather than the former. */
2529
2530 static unsigned int
2531 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2532 {
2533 /* This follows the same structure as aarch64_add_offset. */
2534 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2535 return 0;
2536
2537 unsigned int count = 0;
2538 HOST_WIDE_INT factor = offset.coeffs[1];
2539 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2540 poly_int64 poly_offset (factor, factor);
2541 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2542 /* Need one register for the ADDVL/ADDPL result. */
2543 count += 1;
2544 else if (factor != 0)
2545 {
2546 factor = abs (factor);
2547 if (factor > 16 * (factor & -factor))
2548 /* Need one register for the CNT result and one for the multiplication
2549 factor. If necessary, the second temporary can be reused for the
2550 constant part of the offset. */
2551 return 2;
2552 /* Need one register for the CNT result (which might then
2553 be shifted). */
2554 count += 1;
2555 }
2556 return count + aarch64_add_offset_1_temporaries (constant);
2557 }
2558
2559 /* If X can be represented as a poly_int64, return the number
2560 of temporaries that are required to add it to a register.
2561 Return -1 otherwise. */
2562
2563 int
2564 aarch64_add_offset_temporaries (rtx x)
2565 {
2566 poly_int64 offset;
2567 if (!poly_int_rtx_p (x, &offset))
2568 return -1;
2569 return aarch64_offset_temporaries (true, offset);
2570 }
2571
2572 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2573 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2574 be set and CFA adjustments added to the generated instructions.
2575
2576 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2577 temporary if register allocation is already complete. This temporary
2578 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2579 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2580 false to avoid emitting the immediate again.
2581
2582 TEMP2, if nonnull, is a second temporary register that doesn't
2583 overlap either DEST or REG.
2584
2585 Since this function may be used to adjust the stack pointer, we must
2586 ensure that it cannot cause transient stack deallocation (for example
2587 by first incrementing SP and then decrementing when adjusting by a
2588 large immediate). */
2589
2590 static void
2591 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2592 poly_int64 offset, rtx temp1, rtx temp2,
2593 bool frame_related_p, bool emit_move_imm = true)
2594 {
2595 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2596 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2597 gcc_assert (temp1 == NULL_RTX
2598 || !frame_related_p
2599 || !reg_overlap_mentioned_p (temp1, dest));
2600 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2601
2602 /* Try using ADDVL or ADDPL to add the whole value. */
2603 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2604 {
2605 rtx offset_rtx = gen_int_mode (offset, mode);
2606 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2607 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2608 return;
2609 }
2610
2611 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2612 SVE vector register, over and above the minimum size of 128 bits.
2613 This is equivalent to half the value returned by CNTD with a
2614 vector shape of ALL. */
2615 HOST_WIDE_INT factor = offset.coeffs[1];
2616 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2617
2618 /* Try using ADDVL or ADDPL to add the VG-based part. */
2619 poly_int64 poly_offset (factor, factor);
2620 if (src != const0_rtx
2621 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2622 {
2623 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2624 if (frame_related_p)
2625 {
2626 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2627 RTX_FRAME_RELATED_P (insn) = true;
2628 src = dest;
2629 }
2630 else
2631 {
2632 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2633 src = aarch64_force_temporary (mode, temp1, addr);
2634 temp1 = temp2;
2635 temp2 = NULL_RTX;
2636 }
2637 }
2638 /* Otherwise use a CNT-based sequence. */
2639 else if (factor != 0)
2640 {
2641 /* Use a subtraction if we have a negative factor. */
2642 rtx_code code = PLUS;
2643 if (factor < 0)
2644 {
2645 factor = -factor;
2646 code = MINUS;
2647 }
2648
2649 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2650 into the multiplication. */
2651 rtx val;
2652 int shift = 0;
2653 if (factor & 1)
2654 /* Use a right shift by 1. */
2655 shift = -1;
2656 else
2657 factor /= 2;
2658 HOST_WIDE_INT low_bit = factor & -factor;
2659 if (factor <= 16 * low_bit)
2660 {
2661 if (factor > 16 * 8)
2662 {
2663 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2664 the value with the minimum multiplier and shift it into
2665 position. */
2666 int extra_shift = exact_log2 (low_bit);
2667 shift += extra_shift;
2668 factor >>= extra_shift;
2669 }
2670 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2671 }
2672 else
2673 {
2674 /* Use CNTD, then multiply it by FACTOR. */
2675 val = gen_int_mode (poly_int64 (2, 2), mode);
2676 val = aarch64_force_temporary (mode, temp1, val);
2677
2678 /* Go back to using a negative multiplication factor if we have
2679 no register from which to subtract. */
2680 if (code == MINUS && src == const0_rtx)
2681 {
2682 factor = -factor;
2683 code = PLUS;
2684 }
2685 rtx coeff1 = gen_int_mode (factor, mode);
2686 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2687 val = gen_rtx_MULT (mode, val, coeff1);
2688 }
2689
2690 if (shift > 0)
2691 {
2692 /* Multiply by 1 << SHIFT. */
2693 val = aarch64_force_temporary (mode, temp1, val);
2694 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2695 }
2696 else if (shift == -1)
2697 {
2698 /* Divide by 2. */
2699 val = aarch64_force_temporary (mode, temp1, val);
2700 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2701 }
2702
2703 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2704 if (src != const0_rtx)
2705 {
2706 val = aarch64_force_temporary (mode, temp1, val);
2707 val = gen_rtx_fmt_ee (code, mode, src, val);
2708 }
2709 else if (code == MINUS)
2710 {
2711 val = aarch64_force_temporary (mode, temp1, val);
2712 val = gen_rtx_NEG (mode, val);
2713 }
2714
2715 if (constant == 0 || frame_related_p)
2716 {
2717 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2718 if (frame_related_p)
2719 {
2720 RTX_FRAME_RELATED_P (insn) = true;
2721 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2722 gen_rtx_SET (dest, plus_constant (Pmode, src,
2723 poly_offset)));
2724 }
2725 src = dest;
2726 if (constant == 0)
2727 return;
2728 }
2729 else
2730 {
2731 src = aarch64_force_temporary (mode, temp1, val);
2732 temp1 = temp2;
2733 temp2 = NULL_RTX;
2734 }
2735
2736 emit_move_imm = true;
2737 }
2738
2739 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2740 frame_related_p, emit_move_imm);
2741 }
2742
2743 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2744 than a poly_int64. */
2745
2746 void
2747 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2748 rtx offset_rtx, rtx temp1, rtx temp2)
2749 {
2750 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2751 temp1, temp2, false);
2752 }
2753
2754 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2755 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2756 if TEMP1 already contains abs (DELTA). */
2757
2758 static inline void
2759 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2760 {
2761 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2762 temp1, temp2, true, emit_move_imm);
2763 }
2764
2765 /* Subtract DELTA from the stack pointer, marking the instructions
2766 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2767 if nonnull. */
2768
2769 static inline void
2770 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2771 {
2772 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2773 temp1, temp2, frame_related_p);
2774 }
2775
2776 /* Set DEST to (vec_series BASE STEP). */
2777
2778 static void
2779 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2780 {
2781 machine_mode mode = GET_MODE (dest);
2782 scalar_mode inner = GET_MODE_INNER (mode);
2783
2784 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2785 if (!aarch64_sve_index_immediate_p (base))
2786 base = force_reg (inner, base);
2787 if (!aarch64_sve_index_immediate_p (step))
2788 step = force_reg (inner, step);
2789
2790 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2791 }
2792
2793 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2794 integer of mode INT_MODE. Return true on success. */
2795
2796 static bool
2797 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2798 rtx src)
2799 {
2800 /* If the constant is smaller than 128 bits, we can do the move
2801 using a vector of SRC_MODEs. */
2802 if (src_mode != TImode)
2803 {
2804 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2805 GET_MODE_SIZE (src_mode));
2806 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2807 emit_move_insn (gen_lowpart (dup_mode, dest),
2808 gen_const_vec_duplicate (dup_mode, src));
2809 return true;
2810 }
2811
2812 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2813 src = force_const_mem (src_mode, src);
2814 if (!src)
2815 return false;
2816
2817 /* Make sure that the address is legitimate. */
2818 if (!aarch64_sve_ld1r_operand_p (src))
2819 {
2820 rtx addr = force_reg (Pmode, XEXP (src, 0));
2821 src = replace_equiv_address (src, addr);
2822 }
2823
2824 machine_mode mode = GET_MODE (dest);
2825 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2826 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2827 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2828 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2829 emit_insn (gen_rtx_SET (dest, src));
2830 return true;
2831 }
2832
2833 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2834 isn't a simple duplicate or series. */
2835
2836 static void
2837 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2838 {
2839 machine_mode mode = GET_MODE (src);
2840 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2841 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2842 gcc_assert (npatterns > 1);
2843
2844 if (nelts_per_pattern == 1)
2845 {
2846 /* The constant is a repeating seqeuence of at least two elements,
2847 where the repeating elements occupy no more than 128 bits.
2848 Get an integer representation of the replicated value. */
2849 scalar_int_mode int_mode;
2850 if (BYTES_BIG_ENDIAN)
2851 /* For now, always use LD1RQ to load the value on big-endian
2852 targets, since the handling of smaller integers includes a
2853 subreg that is semantically an element reverse. */
2854 int_mode = TImode;
2855 else
2856 {
2857 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2858 gcc_assert (int_bits <= 128);
2859 int_mode = int_mode_for_size (int_bits, 0).require ();
2860 }
2861 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2862 if (int_value
2863 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2864 return;
2865 }
2866
2867 /* Expand each pattern individually. */
2868 rtx_vector_builder builder;
2869 auto_vec<rtx, 16> vectors (npatterns);
2870 for (unsigned int i = 0; i < npatterns; ++i)
2871 {
2872 builder.new_vector (mode, 1, nelts_per_pattern);
2873 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2874 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2875 vectors.quick_push (force_reg (mode, builder.build ()));
2876 }
2877
2878 /* Use permutes to interleave the separate vectors. */
2879 while (npatterns > 1)
2880 {
2881 npatterns /= 2;
2882 for (unsigned int i = 0; i < npatterns; ++i)
2883 {
2884 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2885 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2886 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2887 vectors[i] = tmp;
2888 }
2889 }
2890 gcc_assert (vectors[0] == dest);
2891 }
2892
2893 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2894 is a pattern that can be used to set DEST to a replicated scalar
2895 element. */
2896
2897 void
2898 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2899 rtx (*gen_vec_duplicate) (rtx, rtx))
2900 {
2901 machine_mode mode = GET_MODE (dest);
2902
2903 /* Check on what type of symbol it is. */
2904 scalar_int_mode int_mode;
2905 if ((GET_CODE (imm) == SYMBOL_REF
2906 || GET_CODE (imm) == LABEL_REF
2907 || GET_CODE (imm) == CONST
2908 || GET_CODE (imm) == CONST_POLY_INT)
2909 && is_a <scalar_int_mode> (mode, &int_mode))
2910 {
2911 rtx mem;
2912 poly_int64 offset;
2913 HOST_WIDE_INT const_offset;
2914 enum aarch64_symbol_type sty;
2915
2916 /* If we have (const (plus symbol offset)), separate out the offset
2917 before we start classifying the symbol. */
2918 rtx base = strip_offset (imm, &offset);
2919
2920 /* We must always add an offset involving VL separately, rather than
2921 folding it into the relocation. */
2922 if (!offset.is_constant (&const_offset))
2923 {
2924 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2925 emit_insn (gen_rtx_SET (dest, imm));
2926 else
2927 {
2928 /* Do arithmetic on 32-bit values if the result is smaller
2929 than that. */
2930 if (partial_subreg_p (int_mode, SImode))
2931 {
2932 /* It is invalid to do symbol calculations in modes
2933 narrower than SImode. */
2934 gcc_assert (base == const0_rtx);
2935 dest = gen_lowpart (SImode, dest);
2936 int_mode = SImode;
2937 }
2938 if (base != const0_rtx)
2939 {
2940 base = aarch64_force_temporary (int_mode, dest, base);
2941 aarch64_add_offset (int_mode, dest, base, offset,
2942 NULL_RTX, NULL_RTX, false);
2943 }
2944 else
2945 aarch64_add_offset (int_mode, dest, base, offset,
2946 dest, NULL_RTX, false);
2947 }
2948 return;
2949 }
2950
2951 sty = aarch64_classify_symbol (base, const_offset);
2952 switch (sty)
2953 {
2954 case SYMBOL_FORCE_TO_MEM:
2955 if (const_offset != 0
2956 && targetm.cannot_force_const_mem (int_mode, imm))
2957 {
2958 gcc_assert (can_create_pseudo_p ());
2959 base = aarch64_force_temporary (int_mode, dest, base);
2960 aarch64_add_offset (int_mode, dest, base, const_offset,
2961 NULL_RTX, NULL_RTX, false);
2962 return;
2963 }
2964
2965 mem = force_const_mem (ptr_mode, imm);
2966 gcc_assert (mem);
2967
2968 /* If we aren't generating PC relative literals, then
2969 we need to expand the literal pool access carefully.
2970 This is something that needs to be done in a number
2971 of places, so could well live as a separate function. */
2972 if (!aarch64_pcrelative_literal_loads)
2973 {
2974 gcc_assert (can_create_pseudo_p ());
2975 base = gen_reg_rtx (ptr_mode);
2976 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2977 if (ptr_mode != Pmode)
2978 base = convert_memory_address (Pmode, base);
2979 mem = gen_rtx_MEM (ptr_mode, base);
2980 }
2981
2982 if (int_mode != ptr_mode)
2983 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2984
2985 emit_insn (gen_rtx_SET (dest, mem));
2986
2987 return;
2988
2989 case SYMBOL_SMALL_TLSGD:
2990 case SYMBOL_SMALL_TLSDESC:
2991 case SYMBOL_SMALL_TLSIE:
2992 case SYMBOL_SMALL_GOT_28K:
2993 case SYMBOL_SMALL_GOT_4G:
2994 case SYMBOL_TINY_GOT:
2995 case SYMBOL_TINY_TLSIE:
2996 if (const_offset != 0)
2997 {
2998 gcc_assert(can_create_pseudo_p ());
2999 base = aarch64_force_temporary (int_mode, dest, base);
3000 aarch64_add_offset (int_mode, dest, base, const_offset,
3001 NULL_RTX, NULL_RTX, false);
3002 return;
3003 }
3004 /* FALLTHRU */
3005
3006 case SYMBOL_SMALL_ABSOLUTE:
3007 case SYMBOL_TINY_ABSOLUTE:
3008 case SYMBOL_TLSLE12:
3009 case SYMBOL_TLSLE24:
3010 case SYMBOL_TLSLE32:
3011 case SYMBOL_TLSLE48:
3012 aarch64_load_symref_appropriately (dest, imm, sty);
3013 return;
3014
3015 default:
3016 gcc_unreachable ();
3017 }
3018 }
3019
3020 if (!CONST_INT_P (imm))
3021 {
3022 rtx base, step, value;
3023 if (GET_CODE (imm) == HIGH
3024 || aarch64_simd_valid_immediate (imm, NULL))
3025 emit_insn (gen_rtx_SET (dest, imm));
3026 else if (const_vec_series_p (imm, &base, &step))
3027 aarch64_expand_vec_series (dest, base, step);
3028 else if (const_vec_duplicate_p (imm, &value))
3029 {
3030 /* If the constant is out of range of an SVE vector move,
3031 load it from memory if we can, otherwise move it into
3032 a register and use a DUP. */
3033 scalar_mode inner_mode = GET_MODE_INNER (mode);
3034 rtx op = force_const_mem (inner_mode, value);
3035 if (!op)
3036 op = force_reg (inner_mode, value);
3037 else if (!aarch64_sve_ld1r_operand_p (op))
3038 {
3039 rtx addr = force_reg (Pmode, XEXP (op, 0));
3040 op = replace_equiv_address (op, addr);
3041 }
3042 emit_insn (gen_vec_duplicate (dest, op));
3043 }
3044 else if (GET_CODE (imm) == CONST_VECTOR
3045 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3046 aarch64_expand_sve_const_vector (dest, imm);
3047 else
3048 {
3049 rtx mem = force_const_mem (mode, imm);
3050 gcc_assert (mem);
3051 emit_move_insn (dest, mem);
3052 }
3053
3054 return;
3055 }
3056
3057 aarch64_internal_mov_immediate (dest, imm, true,
3058 as_a <scalar_int_mode> (mode));
3059 }
3060
3061 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3062 that is known to contain PTRUE. */
3063
3064 void
3065 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3066 {
3067 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3068 gen_rtvec (2, pred, src),
3069 UNSPEC_MERGE_PTRUE)));
3070 }
3071
3072 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3073 operand is in memory. In this case we need to use the predicated LD1
3074 and ST1 instead of LDR and STR, both for correctness on big-endian
3075 targets and because LD1 and ST1 support a wider range of addressing modes.
3076 PRED_MODE is the mode of the predicate.
3077
3078 See the comment at the head of aarch64-sve.md for details about the
3079 big-endian handling. */
3080
3081 void
3082 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3083 {
3084 machine_mode mode = GET_MODE (dest);
3085 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3086 if (!register_operand (src, mode)
3087 && !register_operand (dest, mode))
3088 {
3089 rtx tmp = gen_reg_rtx (mode);
3090 if (MEM_P (src))
3091 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3092 else
3093 emit_move_insn (tmp, src);
3094 src = tmp;
3095 }
3096 aarch64_emit_sve_pred_move (dest, ptrue, src);
3097 }
3098
3099 /* Called only on big-endian targets. See whether an SVE vector move
3100 from SRC to DEST is effectively a REV[BHW] instruction, because at
3101 least one operand is a subreg of an SVE vector that has wider or
3102 narrower elements. Return true and emit the instruction if so.
3103
3104 For example:
3105
3106 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3107
3108 represents a VIEW_CONVERT between the following vectors, viewed
3109 in memory order:
3110
3111 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3112 R1: { [0], [1], [2], [3], ... }
3113
3114 The high part of lane X in R2 should therefore correspond to lane X*2
3115 of R1, but the register representations are:
3116
3117 msb lsb
3118 R2: ...... [1].high [1].low [0].high [0].low
3119 R1: ...... [3] [2] [1] [0]
3120
3121 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3122 We therefore need a reverse operation to swap the high and low values
3123 around.
3124
3125 This is purely an optimization. Without it we would spill the
3126 subreg operand to the stack in one mode and reload it in the
3127 other mode, which has the same effect as the REV. */
3128
3129 bool
3130 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3131 {
3132 gcc_assert (BYTES_BIG_ENDIAN);
3133 if (GET_CODE (dest) == SUBREG)
3134 dest = SUBREG_REG (dest);
3135 if (GET_CODE (src) == SUBREG)
3136 src = SUBREG_REG (src);
3137
3138 /* The optimization handles two single SVE REGs with different element
3139 sizes. */
3140 if (!REG_P (dest)
3141 || !REG_P (src)
3142 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3143 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3144 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3145 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3146 return false;
3147
3148 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3149 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3150 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3151 UNSPEC_REV_SUBREG);
3152 emit_insn (gen_rtx_SET (dest, unspec));
3153 return true;
3154 }
3155
3156 /* Return a copy of X with mode MODE, without changing its other
3157 attributes. Unlike gen_lowpart, this doesn't care whether the
3158 mode change is valid. */
3159
3160 static rtx
3161 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3162 {
3163 if (GET_MODE (x) == mode)
3164 return x;
3165
3166 x = shallow_copy_rtx (x);
3167 set_mode_and_regno (x, mode, REGNO (x));
3168 return x;
3169 }
3170
3171 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3172 operands. */
3173
3174 void
3175 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3176 {
3177 /* Decide which REV operation we need. The mode with narrower elements
3178 determines the mode of the operands and the mode with the wider
3179 elements determines the reverse width. */
3180 machine_mode mode_with_wider_elts = GET_MODE (dest);
3181 machine_mode mode_with_narrower_elts = GET_MODE (src);
3182 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3183 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3184 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3185
3186 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3187 unsigned int unspec;
3188 if (wider_bytes == 8)
3189 unspec = UNSPEC_REV64;
3190 else if (wider_bytes == 4)
3191 unspec = UNSPEC_REV32;
3192 else if (wider_bytes == 2)
3193 unspec = UNSPEC_REV16;
3194 else
3195 gcc_unreachable ();
3196 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3197
3198 /* Emit:
3199
3200 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3201 UNSPEC_MERGE_PTRUE))
3202
3203 with the appropriate modes. */
3204 ptrue = gen_lowpart (pred_mode, ptrue);
3205 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3206 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3207 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3208 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3209 UNSPEC_MERGE_PTRUE);
3210 emit_insn (gen_rtx_SET (dest, src));
3211 }
3212
3213 static bool
3214 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3215 tree exp ATTRIBUTE_UNUSED)
3216 {
3217 /* Currently, always true. */
3218 return true;
3219 }
3220
3221 /* Implement TARGET_PASS_BY_REFERENCE. */
3222
3223 static bool
3224 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3225 machine_mode mode,
3226 const_tree type,
3227 bool named ATTRIBUTE_UNUSED)
3228 {
3229 HOST_WIDE_INT size;
3230 machine_mode dummymode;
3231 int nregs;
3232
3233 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3234 if (mode == BLKmode && type)
3235 size = int_size_in_bytes (type);
3236 else
3237 /* No frontends can create types with variable-sized modes, so we
3238 shouldn't be asked to pass or return them. */
3239 size = GET_MODE_SIZE (mode).to_constant ();
3240
3241 /* Aggregates are passed by reference based on their size. */
3242 if (type && AGGREGATE_TYPE_P (type))
3243 {
3244 size = int_size_in_bytes (type);
3245 }
3246
3247 /* Variable sized arguments are always returned by reference. */
3248 if (size < 0)
3249 return true;
3250
3251 /* Can this be a candidate to be passed in fp/simd register(s)? */
3252 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3253 &dummymode, &nregs,
3254 NULL))
3255 return false;
3256
3257 /* Arguments which are variable sized or larger than 2 registers are
3258 passed by reference unless they are a homogenous floating point
3259 aggregate. */
3260 return size > 2 * UNITS_PER_WORD;
3261 }
3262
3263 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3264 static bool
3265 aarch64_return_in_msb (const_tree valtype)
3266 {
3267 machine_mode dummy_mode;
3268 int dummy_int;
3269
3270 /* Never happens in little-endian mode. */
3271 if (!BYTES_BIG_ENDIAN)
3272 return false;
3273
3274 /* Only composite types smaller than or equal to 16 bytes can
3275 be potentially returned in registers. */
3276 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3277 || int_size_in_bytes (valtype) <= 0
3278 || int_size_in_bytes (valtype) > 16)
3279 return false;
3280
3281 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3282 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3283 is always passed/returned in the least significant bits of fp/simd
3284 register(s). */
3285 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3286 &dummy_mode, &dummy_int, NULL))
3287 return false;
3288
3289 return true;
3290 }
3291
3292 /* Implement TARGET_FUNCTION_VALUE.
3293 Define how to find the value returned by a function. */
3294
3295 static rtx
3296 aarch64_function_value (const_tree type, const_tree func,
3297 bool outgoing ATTRIBUTE_UNUSED)
3298 {
3299 machine_mode mode;
3300 int unsignedp;
3301 int count;
3302 machine_mode ag_mode;
3303
3304 mode = TYPE_MODE (type);
3305 if (INTEGRAL_TYPE_P (type))
3306 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3307
3308 if (aarch64_return_in_msb (type))
3309 {
3310 HOST_WIDE_INT size = int_size_in_bytes (type);
3311
3312 if (size % UNITS_PER_WORD != 0)
3313 {
3314 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3315 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3316 }
3317 }
3318
3319 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3320 &ag_mode, &count, NULL))
3321 {
3322 if (!aarch64_composite_type_p (type, mode))
3323 {
3324 gcc_assert (count == 1 && mode == ag_mode);
3325 return gen_rtx_REG (mode, V0_REGNUM);
3326 }
3327 else
3328 {
3329 int i;
3330 rtx par;
3331
3332 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3333 for (i = 0; i < count; i++)
3334 {
3335 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3336 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3337 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3338 XVECEXP (par, 0, i) = tmp;
3339 }
3340 return par;
3341 }
3342 }
3343 else
3344 return gen_rtx_REG (mode, R0_REGNUM);
3345 }
3346
3347 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3348 Return true if REGNO is the number of a hard register in which the values
3349 of called function may come back. */
3350
3351 static bool
3352 aarch64_function_value_regno_p (const unsigned int regno)
3353 {
3354 /* Maximum of 16 bytes can be returned in the general registers. Examples
3355 of 16-byte return values are: 128-bit integers and 16-byte small
3356 structures (excluding homogeneous floating-point aggregates). */
3357 if (regno == R0_REGNUM || regno == R1_REGNUM)
3358 return true;
3359
3360 /* Up to four fp/simd registers can return a function value, e.g. a
3361 homogeneous floating-point aggregate having four members. */
3362 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3363 return TARGET_FLOAT;
3364
3365 return false;
3366 }
3367
3368 /* Implement TARGET_RETURN_IN_MEMORY.
3369
3370 If the type T of the result of a function is such that
3371 void func (T arg)
3372 would require that arg be passed as a value in a register (or set of
3373 registers) according to the parameter passing rules, then the result
3374 is returned in the same registers as would be used for such an
3375 argument. */
3376
3377 static bool
3378 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3379 {
3380 HOST_WIDE_INT size;
3381 machine_mode ag_mode;
3382 int count;
3383
3384 if (!AGGREGATE_TYPE_P (type)
3385 && TREE_CODE (type) != COMPLEX_TYPE
3386 && TREE_CODE (type) != VECTOR_TYPE)
3387 /* Simple scalar types always returned in registers. */
3388 return false;
3389
3390 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3391 type,
3392 &ag_mode,
3393 &count,
3394 NULL))
3395 return false;
3396
3397 /* Types larger than 2 registers returned in memory. */
3398 size = int_size_in_bytes (type);
3399 return (size < 0 || size > 2 * UNITS_PER_WORD);
3400 }
3401
3402 static bool
3403 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3404 const_tree type, int *nregs)
3405 {
3406 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3407 return aarch64_vfp_is_call_or_return_candidate (mode,
3408 type,
3409 &pcum->aapcs_vfp_rmode,
3410 nregs,
3411 NULL);
3412 }
3413
3414 /* Given MODE and TYPE of a function argument, return the alignment in
3415 bits. The idea is to suppress any stronger alignment requested by
3416 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3417 This is a helper function for local use only. */
3418
3419 static unsigned int
3420 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3421 {
3422 if (!type)
3423 return GET_MODE_ALIGNMENT (mode);
3424
3425 if (integer_zerop (TYPE_SIZE (type)))
3426 return 0;
3427
3428 gcc_assert (TYPE_MODE (type) == mode);
3429
3430 if (!AGGREGATE_TYPE_P (type))
3431 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3432
3433 if (TREE_CODE (type) == ARRAY_TYPE)
3434 return TYPE_ALIGN (TREE_TYPE (type));
3435
3436 unsigned int alignment = 0;
3437 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3438 if (TREE_CODE (field) == FIELD_DECL)
3439 alignment = std::max (alignment, DECL_ALIGN (field));
3440
3441 return alignment;
3442 }
3443
3444 /* Layout a function argument according to the AAPCS64 rules. The rule
3445 numbers refer to the rule numbers in the AAPCS64. */
3446
3447 static void
3448 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3449 const_tree type,
3450 bool named ATTRIBUTE_UNUSED)
3451 {
3452 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3453 int ncrn, nvrn, nregs;
3454 bool allocate_ncrn, allocate_nvrn;
3455 HOST_WIDE_INT size;
3456
3457 /* We need to do this once per argument. */
3458 if (pcum->aapcs_arg_processed)
3459 return;
3460
3461 pcum->aapcs_arg_processed = true;
3462
3463 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3464 if (type)
3465 size = int_size_in_bytes (type);
3466 else
3467 /* No frontends can create types with variable-sized modes, so we
3468 shouldn't be asked to pass or return them. */
3469 size = GET_MODE_SIZE (mode).to_constant ();
3470 size = ROUND_UP (size, UNITS_PER_WORD);
3471
3472 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3473 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3474 mode,
3475 type,
3476 &nregs);
3477
3478 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3479 The following code thus handles passing by SIMD/FP registers first. */
3480
3481 nvrn = pcum->aapcs_nvrn;
3482
3483 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3484 and homogenous short-vector aggregates (HVA). */
3485 if (allocate_nvrn)
3486 {
3487 if (!TARGET_FLOAT)
3488 aarch64_err_no_fpadvsimd (mode, "argument");
3489
3490 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3491 {
3492 pcum->aapcs_nextnvrn = nvrn + nregs;
3493 if (!aarch64_composite_type_p (type, mode))
3494 {
3495 gcc_assert (nregs == 1);
3496 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3497 }
3498 else
3499 {
3500 rtx par;
3501 int i;
3502 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3503 for (i = 0; i < nregs; i++)
3504 {
3505 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3506 V0_REGNUM + nvrn + i);
3507 rtx offset = gen_int_mode
3508 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3509 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3510 XVECEXP (par, 0, i) = tmp;
3511 }
3512 pcum->aapcs_reg = par;
3513 }
3514 return;
3515 }
3516 else
3517 {
3518 /* C.3 NSRN is set to 8. */
3519 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3520 goto on_stack;
3521 }
3522 }
3523
3524 ncrn = pcum->aapcs_ncrn;
3525 nregs = size / UNITS_PER_WORD;
3526
3527 /* C6 - C9. though the sign and zero extension semantics are
3528 handled elsewhere. This is the case where the argument fits
3529 entirely general registers. */
3530 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3531 {
3532
3533 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3534
3535 /* C.8 if the argument has an alignment of 16 then the NGRN is
3536 rounded up to the next even number. */
3537 if (nregs == 2
3538 && ncrn % 2
3539 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3540 comparison is there because for > 16 * BITS_PER_UNIT
3541 alignment nregs should be > 2 and therefore it should be
3542 passed by reference rather than value. */
3543 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3544 {
3545 ++ncrn;
3546 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3547 }
3548
3549 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3550 A reg is still generated for it, but the caller should be smart
3551 enough not to use it. */
3552 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3553 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3554 else
3555 {
3556 rtx par;
3557 int i;
3558
3559 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3560 for (i = 0; i < nregs; i++)
3561 {
3562 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3563 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3564 GEN_INT (i * UNITS_PER_WORD));
3565 XVECEXP (par, 0, i) = tmp;
3566 }
3567 pcum->aapcs_reg = par;
3568 }
3569
3570 pcum->aapcs_nextncrn = ncrn + nregs;
3571 return;
3572 }
3573
3574 /* C.11 */
3575 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3576
3577 /* The argument is passed on stack; record the needed number of words for
3578 this argument and align the total size if necessary. */
3579 on_stack:
3580 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3581
3582 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3583 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3584 16 / UNITS_PER_WORD);
3585 return;
3586 }
3587
3588 /* Implement TARGET_FUNCTION_ARG. */
3589
3590 static rtx
3591 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3592 const_tree type, bool named)
3593 {
3594 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3595 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3596
3597 if (mode == VOIDmode)
3598 return NULL_RTX;
3599
3600 aarch64_layout_arg (pcum_v, mode, type, named);
3601 return pcum->aapcs_reg;
3602 }
3603
3604 void
3605 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3606 const_tree fntype ATTRIBUTE_UNUSED,
3607 rtx libname ATTRIBUTE_UNUSED,
3608 const_tree fndecl ATTRIBUTE_UNUSED,
3609 unsigned n_named ATTRIBUTE_UNUSED)
3610 {
3611 pcum->aapcs_ncrn = 0;
3612 pcum->aapcs_nvrn = 0;
3613 pcum->aapcs_nextncrn = 0;
3614 pcum->aapcs_nextnvrn = 0;
3615 pcum->pcs_variant = ARM_PCS_AAPCS64;
3616 pcum->aapcs_reg = NULL_RTX;
3617 pcum->aapcs_arg_processed = false;
3618 pcum->aapcs_stack_words = 0;
3619 pcum->aapcs_stack_size = 0;
3620
3621 if (!TARGET_FLOAT
3622 && fndecl && TREE_PUBLIC (fndecl)
3623 && fntype && fntype != error_mark_node)
3624 {
3625 const_tree type = TREE_TYPE (fntype);
3626 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3627 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3628 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3629 &mode, &nregs, NULL))
3630 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3631 }
3632 return;
3633 }
3634
3635 static void
3636 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3637 machine_mode mode,
3638 const_tree type,
3639 bool named)
3640 {
3641 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3642 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3643 {
3644 aarch64_layout_arg (pcum_v, mode, type, named);
3645 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3646 != (pcum->aapcs_stack_words != 0));
3647 pcum->aapcs_arg_processed = false;
3648 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3649 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3650 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3651 pcum->aapcs_stack_words = 0;
3652 pcum->aapcs_reg = NULL_RTX;
3653 }
3654 }
3655
3656 bool
3657 aarch64_function_arg_regno_p (unsigned regno)
3658 {
3659 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3660 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3661 }
3662
3663 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3664 PARM_BOUNDARY bits of alignment, but will be given anything up
3665 to STACK_BOUNDARY bits if the type requires it. This makes sure
3666 that both before and after the layout of each argument, the Next
3667 Stacked Argument Address (NSAA) will have a minimum alignment of
3668 8 bytes. */
3669
3670 static unsigned int
3671 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3672 {
3673 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3674 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3675 }
3676
3677 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3678
3679 static fixed_size_mode
3680 aarch64_get_reg_raw_mode (int regno)
3681 {
3682 if (TARGET_SVE && FP_REGNUM_P (regno))
3683 /* Don't use the SVE part of the register for __builtin_apply and
3684 __builtin_return. The SVE registers aren't used by the normal PCS,
3685 so using them there would be a waste of time. The PCS extensions
3686 for SVE types are fundamentally incompatible with the
3687 __builtin_return/__builtin_apply interface. */
3688 return as_a <fixed_size_mode> (V16QImode);
3689 return default_get_reg_raw_mode (regno);
3690 }
3691
3692 /* Implement TARGET_FUNCTION_ARG_PADDING.
3693
3694 Small aggregate types are placed in the lowest memory address.
3695
3696 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3697
3698 static pad_direction
3699 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3700 {
3701 /* On little-endian targets, the least significant byte of every stack
3702 argument is passed at the lowest byte address of the stack slot. */
3703 if (!BYTES_BIG_ENDIAN)
3704 return PAD_UPWARD;
3705
3706 /* Otherwise, integral, floating-point and pointer types are padded downward:
3707 the least significant byte of a stack argument is passed at the highest
3708 byte address of the stack slot. */
3709 if (type
3710 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3711 || POINTER_TYPE_P (type))
3712 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3713 return PAD_DOWNWARD;
3714
3715 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3716 return PAD_UPWARD;
3717 }
3718
3719 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3720
3721 It specifies padding for the last (may also be the only)
3722 element of a block move between registers and memory. If
3723 assuming the block is in the memory, padding upward means that
3724 the last element is padded after its highest significant byte,
3725 while in downward padding, the last element is padded at the
3726 its least significant byte side.
3727
3728 Small aggregates and small complex types are always padded
3729 upwards.
3730
3731 We don't need to worry about homogeneous floating-point or
3732 short-vector aggregates; their move is not affected by the
3733 padding direction determined here. Regardless of endianness,
3734 each element of such an aggregate is put in the least
3735 significant bits of a fp/simd register.
3736
3737 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3738 register has useful data, and return the opposite if the most
3739 significant byte does. */
3740
3741 bool
3742 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3743 bool first ATTRIBUTE_UNUSED)
3744 {
3745
3746 /* Small composite types are always padded upward. */
3747 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3748 {
3749 HOST_WIDE_INT size;
3750 if (type)
3751 size = int_size_in_bytes (type);
3752 else
3753 /* No frontends can create types with variable-sized modes, so we
3754 shouldn't be asked to pass or return them. */
3755 size = GET_MODE_SIZE (mode).to_constant ();
3756 if (size < 2 * UNITS_PER_WORD)
3757 return true;
3758 }
3759
3760 /* Otherwise, use the default padding. */
3761 return !BYTES_BIG_ENDIAN;
3762 }
3763
3764 static scalar_int_mode
3765 aarch64_libgcc_cmp_return_mode (void)
3766 {
3767 return SImode;
3768 }
3769
3770 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3771
3772 /* We use the 12-bit shifted immediate arithmetic instructions so values
3773 must be multiple of (1 << 12), i.e. 4096. */
3774 #define ARITH_FACTOR 4096
3775
3776 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3777 #error Cannot use simple address calculation for stack probing
3778 #endif
3779
3780 /* The pair of scratch registers used for stack probing. */
3781 #define PROBE_STACK_FIRST_REG 9
3782 #define PROBE_STACK_SECOND_REG 10
3783
3784 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3785 inclusive. These are offsets from the current stack pointer. */
3786
3787 static void
3788 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3789 {
3790 HOST_WIDE_INT size;
3791 if (!poly_size.is_constant (&size))
3792 {
3793 sorry ("stack probes for SVE frames");
3794 return;
3795 }
3796
3797 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3798
3799 /* See the same assertion on PROBE_INTERVAL above. */
3800 gcc_assert ((first % ARITH_FACTOR) == 0);
3801
3802 /* See if we have a constant small number of probes to generate. If so,
3803 that's the easy case. */
3804 if (size <= PROBE_INTERVAL)
3805 {
3806 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3807
3808 emit_set_insn (reg1,
3809 plus_constant (Pmode,
3810 stack_pointer_rtx, -(first + base)));
3811 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3812 }
3813
3814 /* The run-time loop is made up of 8 insns in the generic case while the
3815 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3816 else if (size <= 4 * PROBE_INTERVAL)
3817 {
3818 HOST_WIDE_INT i, rem;
3819
3820 emit_set_insn (reg1,
3821 plus_constant (Pmode,
3822 stack_pointer_rtx,
3823 -(first + PROBE_INTERVAL)));
3824 emit_stack_probe (reg1);
3825
3826 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3827 it exceeds SIZE. If only two probes are needed, this will not
3828 generate any code. Then probe at FIRST + SIZE. */
3829 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3830 {
3831 emit_set_insn (reg1,
3832 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3833 emit_stack_probe (reg1);
3834 }
3835
3836 rem = size - (i - PROBE_INTERVAL);
3837 if (rem > 256)
3838 {
3839 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3840
3841 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3842 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3843 }
3844 else
3845 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3846 }
3847
3848 /* Otherwise, do the same as above, but in a loop. Note that we must be
3849 extra careful with variables wrapping around because we might be at
3850 the very top (or the very bottom) of the address space and we have
3851 to be able to handle this case properly; in particular, we use an
3852 equality test for the loop condition. */
3853 else
3854 {
3855 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3856
3857 /* Step 1: round SIZE to the previous multiple of the interval. */
3858
3859 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3860
3861
3862 /* Step 2: compute initial and final value of the loop counter. */
3863
3864 /* TEST_ADDR = SP + FIRST. */
3865 emit_set_insn (reg1,
3866 plus_constant (Pmode, stack_pointer_rtx, -first));
3867
3868 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3869 HOST_WIDE_INT adjustment = - (first + rounded_size);
3870 if (! aarch64_uimm12_shift (adjustment))
3871 {
3872 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3873 true, Pmode);
3874 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3875 }
3876 else
3877 emit_set_insn (reg2,
3878 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3879
3880 /* Step 3: the loop
3881
3882 do
3883 {
3884 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3885 probe at TEST_ADDR
3886 }
3887 while (TEST_ADDR != LAST_ADDR)
3888
3889 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3890 until it is equal to ROUNDED_SIZE. */
3891
3892 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3893
3894
3895 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3896 that SIZE is equal to ROUNDED_SIZE. */
3897
3898 if (size != rounded_size)
3899 {
3900 HOST_WIDE_INT rem = size - rounded_size;
3901
3902 if (rem > 256)
3903 {
3904 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3905
3906 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3907 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3908 }
3909 else
3910 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3911 }
3912 }
3913
3914 /* Make sure nothing is scheduled before we are done. */
3915 emit_insn (gen_blockage ());
3916 }
3917
3918 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3919 absolute addresses. */
3920
3921 const char *
3922 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3923 {
3924 static int labelno = 0;
3925 char loop_lab[32];
3926 rtx xops[2];
3927
3928 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3929
3930 /* Loop. */
3931 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3932
3933 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3934 xops[0] = reg1;
3935 xops[1] = GEN_INT (PROBE_INTERVAL);
3936 output_asm_insn ("sub\t%0, %0, %1", xops);
3937
3938 /* Probe at TEST_ADDR. */
3939 output_asm_insn ("str\txzr, [%0]", xops);
3940
3941 /* Test if TEST_ADDR == LAST_ADDR. */
3942 xops[1] = reg2;
3943 output_asm_insn ("cmp\t%0, %1", xops);
3944
3945 /* Branch. */
3946 fputs ("\tb.ne\t", asm_out_file);
3947 assemble_name_raw (asm_out_file, loop_lab);
3948 fputc ('\n', asm_out_file);
3949
3950 return "";
3951 }
3952
3953 /* Mark the registers that need to be saved by the callee and calculate
3954 the size of the callee-saved registers area and frame record (both FP
3955 and LR may be omitted). */
3956 static void
3957 aarch64_layout_frame (void)
3958 {
3959 HOST_WIDE_INT offset = 0;
3960 int regno, last_fp_reg = INVALID_REGNUM;
3961
3962 if (reload_completed && cfun->machine->frame.laid_out)
3963 return;
3964
3965 /* Force a frame chain for EH returns so the return address is at FP+8. */
3966 cfun->machine->frame.emit_frame_chain
3967 = frame_pointer_needed || crtl->calls_eh_return;
3968
3969 /* Emit a frame chain if the frame pointer is enabled.
3970 If -momit-leaf-frame-pointer is used, do not use a frame chain
3971 in leaf functions which do not use LR. */
3972 if (flag_omit_frame_pointer == 2
3973 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3974 && !df_regs_ever_live_p (LR_REGNUM)))
3975 cfun->machine->frame.emit_frame_chain = true;
3976
3977 #define SLOT_NOT_REQUIRED (-2)
3978 #define SLOT_REQUIRED (-1)
3979
3980 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3981 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3982
3983 /* First mark all the registers that really need to be saved... */
3984 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3985 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3986
3987 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3988 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3989
3990 /* ... that includes the eh data registers (if needed)... */
3991 if (crtl->calls_eh_return)
3992 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3993 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3994 = SLOT_REQUIRED;
3995
3996 /* ... and any callee saved register that dataflow says is live. */
3997 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3998 if (df_regs_ever_live_p (regno)
3999 && (regno == R30_REGNUM
4000 || !call_used_regs[regno]))
4001 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4002
4003 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4004 if (df_regs_ever_live_p (regno)
4005 && !call_used_regs[regno])
4006 {
4007 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4008 last_fp_reg = regno;
4009 }
4010
4011 if (cfun->machine->frame.emit_frame_chain)
4012 {
4013 /* FP and LR are placed in the linkage record. */
4014 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4015 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4016 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4017 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4018 offset = 2 * UNITS_PER_WORD;
4019 }
4020
4021 /* Now assign stack slots for them. */
4022 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4023 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4024 {
4025 cfun->machine->frame.reg_offset[regno] = offset;
4026 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4027 cfun->machine->frame.wb_candidate1 = regno;
4028 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4029 cfun->machine->frame.wb_candidate2 = regno;
4030 offset += UNITS_PER_WORD;
4031 }
4032
4033 HOST_WIDE_INT max_int_offset = offset;
4034 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4035 bool has_align_gap = offset != max_int_offset;
4036
4037 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4038 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4039 {
4040 /* If there is an alignment gap between integer and fp callee-saves,
4041 allocate the last fp register to it if possible. */
4042 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4043 {
4044 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4045 break;
4046 }
4047
4048 cfun->machine->frame.reg_offset[regno] = offset;
4049 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4050 cfun->machine->frame.wb_candidate1 = regno;
4051 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4052 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4053 cfun->machine->frame.wb_candidate2 = regno;
4054 offset += UNITS_PER_WORD;
4055 }
4056
4057 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4058
4059 cfun->machine->frame.saved_regs_size = offset;
4060
4061 HOST_WIDE_INT varargs_and_saved_regs_size
4062 = offset + cfun->machine->frame.saved_varargs_size;
4063
4064 cfun->machine->frame.hard_fp_offset
4065 = aligned_upper_bound (varargs_and_saved_regs_size
4066 + get_frame_size (),
4067 STACK_BOUNDARY / BITS_PER_UNIT);
4068
4069 /* Both these values are already aligned. */
4070 gcc_assert (multiple_p (crtl->outgoing_args_size,
4071 STACK_BOUNDARY / BITS_PER_UNIT));
4072 cfun->machine->frame.frame_size
4073 = (cfun->machine->frame.hard_fp_offset
4074 + crtl->outgoing_args_size);
4075
4076 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4077
4078 cfun->machine->frame.initial_adjust = 0;
4079 cfun->machine->frame.final_adjust = 0;
4080 cfun->machine->frame.callee_adjust = 0;
4081 cfun->machine->frame.callee_offset = 0;
4082
4083 HOST_WIDE_INT max_push_offset = 0;
4084 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4085 max_push_offset = 512;
4086 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4087 max_push_offset = 256;
4088
4089 HOST_WIDE_INT const_size, const_fp_offset;
4090 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4091 && const_size < max_push_offset
4092 && known_eq (crtl->outgoing_args_size, 0))
4093 {
4094 /* Simple, small frame with no outgoing arguments:
4095 stp reg1, reg2, [sp, -frame_size]!
4096 stp reg3, reg4, [sp, 16] */
4097 cfun->machine->frame.callee_adjust = const_size;
4098 }
4099 else if (known_lt (crtl->outgoing_args_size
4100 + cfun->machine->frame.saved_regs_size, 512)
4101 && !(cfun->calls_alloca
4102 && known_lt (cfun->machine->frame.hard_fp_offset,
4103 max_push_offset)))
4104 {
4105 /* Frame with small outgoing arguments:
4106 sub sp, sp, frame_size
4107 stp reg1, reg2, [sp, outgoing_args_size]
4108 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4109 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4110 cfun->machine->frame.callee_offset
4111 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4112 }
4113 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4114 && const_fp_offset < max_push_offset)
4115 {
4116 /* Frame with large outgoing arguments but a small local area:
4117 stp reg1, reg2, [sp, -hard_fp_offset]!
4118 stp reg3, reg4, [sp, 16]
4119 sub sp, sp, outgoing_args_size */
4120 cfun->machine->frame.callee_adjust = const_fp_offset;
4121 cfun->machine->frame.final_adjust
4122 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4123 }
4124 else
4125 {
4126 /* Frame with large local area and outgoing arguments using frame pointer:
4127 sub sp, sp, hard_fp_offset
4128 stp x29, x30, [sp, 0]
4129 add x29, sp, 0
4130 stp reg3, reg4, [sp, 16]
4131 sub sp, sp, outgoing_args_size */
4132 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4133 cfun->machine->frame.final_adjust
4134 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4135 }
4136
4137 cfun->machine->frame.laid_out = true;
4138 }
4139
4140 /* Return true if the register REGNO is saved on entry to
4141 the current function. */
4142
4143 static bool
4144 aarch64_register_saved_on_entry (int regno)
4145 {
4146 return cfun->machine->frame.reg_offset[regno] >= 0;
4147 }
4148
4149 /* Return the next register up from REGNO up to LIMIT for the callee
4150 to save. */
4151
4152 static unsigned
4153 aarch64_next_callee_save (unsigned regno, unsigned limit)
4154 {
4155 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4156 regno ++;
4157 return regno;
4158 }
4159
4160 /* Push the register number REGNO of mode MODE to the stack with write-back
4161 adjusting the stack by ADJUSTMENT. */
4162
4163 static void
4164 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4165 HOST_WIDE_INT adjustment)
4166 {
4167 rtx base_rtx = stack_pointer_rtx;
4168 rtx insn, reg, mem;
4169
4170 reg = gen_rtx_REG (mode, regno);
4171 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4172 plus_constant (Pmode, base_rtx, -adjustment));
4173 mem = gen_frame_mem (mode, mem);
4174
4175 insn = emit_move_insn (mem, reg);
4176 RTX_FRAME_RELATED_P (insn) = 1;
4177 }
4178
4179 /* Generate and return an instruction to store the pair of registers
4180 REG and REG2 of mode MODE to location BASE with write-back adjusting
4181 the stack location BASE by ADJUSTMENT. */
4182
4183 static rtx
4184 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4185 HOST_WIDE_INT adjustment)
4186 {
4187 switch (mode)
4188 {
4189 case E_DImode:
4190 return gen_storewb_pairdi_di (base, base, reg, reg2,
4191 GEN_INT (-adjustment),
4192 GEN_INT (UNITS_PER_WORD - adjustment));
4193 case E_DFmode:
4194 return gen_storewb_pairdf_di (base, base, reg, reg2,
4195 GEN_INT (-adjustment),
4196 GEN_INT (UNITS_PER_WORD - adjustment));
4197 default:
4198 gcc_unreachable ();
4199 }
4200 }
4201
4202 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4203 stack pointer by ADJUSTMENT. */
4204
4205 static void
4206 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4207 {
4208 rtx_insn *insn;
4209 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4210
4211 if (regno2 == INVALID_REGNUM)
4212 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4213
4214 rtx reg1 = gen_rtx_REG (mode, regno1);
4215 rtx reg2 = gen_rtx_REG (mode, regno2);
4216
4217 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4218 reg2, adjustment));
4219 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4220 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4221 RTX_FRAME_RELATED_P (insn) = 1;
4222 }
4223
4224 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4225 adjusting it by ADJUSTMENT afterwards. */
4226
4227 static rtx
4228 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4229 HOST_WIDE_INT adjustment)
4230 {
4231 switch (mode)
4232 {
4233 case E_DImode:
4234 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4235 GEN_INT (UNITS_PER_WORD));
4236 case E_DFmode:
4237 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4238 GEN_INT (UNITS_PER_WORD));
4239 default:
4240 gcc_unreachable ();
4241 }
4242 }
4243
4244 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4245 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4246 into CFI_OPS. */
4247
4248 static void
4249 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4250 rtx *cfi_ops)
4251 {
4252 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4253 rtx reg1 = gen_rtx_REG (mode, regno1);
4254
4255 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4256
4257 if (regno2 == INVALID_REGNUM)
4258 {
4259 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4260 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4261 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4262 }
4263 else
4264 {
4265 rtx reg2 = gen_rtx_REG (mode, regno2);
4266 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4267 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4268 reg2, adjustment));
4269 }
4270 }
4271
4272 /* Generate and return a store pair instruction of mode MODE to store
4273 register REG1 to MEM1 and register REG2 to MEM2. */
4274
4275 static rtx
4276 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4277 rtx reg2)
4278 {
4279 switch (mode)
4280 {
4281 case E_DImode:
4282 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4283
4284 case E_DFmode:
4285 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4286
4287 default:
4288 gcc_unreachable ();
4289 }
4290 }
4291
4292 /* Generate and regurn a load pair isntruction of mode MODE to load register
4293 REG1 from MEM1 and register REG2 from MEM2. */
4294
4295 static rtx
4296 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4297 rtx mem2)
4298 {
4299 switch (mode)
4300 {
4301 case E_DImode:
4302 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4303
4304 case E_DFmode:
4305 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4306
4307 default:
4308 gcc_unreachable ();
4309 }
4310 }
4311
4312 /* Return TRUE if return address signing should be enabled for the current
4313 function, otherwise return FALSE. */
4314
4315 bool
4316 aarch64_return_address_signing_enabled (void)
4317 {
4318 /* This function should only be called after frame laid out. */
4319 gcc_assert (cfun->machine->frame.laid_out);
4320
4321 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4322 if it's LR is pushed onto stack. */
4323 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4324 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4325 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4326 }
4327
4328 /* Emit code to save the callee-saved registers from register number START
4329 to LIMIT to the stack at the location starting at offset START_OFFSET,
4330 skipping any write-back candidates if SKIP_WB is true. */
4331
4332 static void
4333 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4334 unsigned start, unsigned limit, bool skip_wb)
4335 {
4336 rtx_insn *insn;
4337 unsigned regno;
4338 unsigned regno2;
4339
4340 for (regno = aarch64_next_callee_save (start, limit);
4341 regno <= limit;
4342 regno = aarch64_next_callee_save (regno + 1, limit))
4343 {
4344 rtx reg, mem;
4345 poly_int64 offset;
4346
4347 if (skip_wb
4348 && (regno == cfun->machine->frame.wb_candidate1
4349 || regno == cfun->machine->frame.wb_candidate2))
4350 continue;
4351
4352 if (cfun->machine->reg_is_wrapped_separately[regno])
4353 continue;
4354
4355 reg = gen_rtx_REG (mode, regno);
4356 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4357 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4358 offset));
4359
4360 regno2 = aarch64_next_callee_save (regno + 1, limit);
4361
4362 if (regno2 <= limit
4363 && !cfun->machine->reg_is_wrapped_separately[regno2]
4364 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4365 == cfun->machine->frame.reg_offset[regno2]))
4366
4367 {
4368 rtx reg2 = gen_rtx_REG (mode, regno2);
4369 rtx mem2;
4370
4371 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4372 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4373 offset));
4374 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4375 reg2));
4376
4377 /* The first part of a frame-related parallel insn is
4378 always assumed to be relevant to the frame
4379 calculations; subsequent parts, are only
4380 frame-related if explicitly marked. */
4381 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4382 regno = regno2;
4383 }
4384 else
4385 insn = emit_move_insn (mem, reg);
4386
4387 RTX_FRAME_RELATED_P (insn) = 1;
4388 }
4389 }
4390
4391 /* Emit code to restore the callee registers of mode MODE from register
4392 number START up to and including LIMIT. Restore from the stack offset
4393 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4394 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4395
4396 static void
4397 aarch64_restore_callee_saves (machine_mode mode,
4398 poly_int64 start_offset, unsigned start,
4399 unsigned limit, bool skip_wb, rtx *cfi_ops)
4400 {
4401 rtx base_rtx = stack_pointer_rtx;
4402 unsigned regno;
4403 unsigned regno2;
4404 poly_int64 offset;
4405
4406 for (regno = aarch64_next_callee_save (start, limit);
4407 regno <= limit;
4408 regno = aarch64_next_callee_save (regno + 1, limit))
4409 {
4410 if (cfun->machine->reg_is_wrapped_separately[regno])
4411 continue;
4412
4413 rtx reg, mem;
4414
4415 if (skip_wb
4416 && (regno == cfun->machine->frame.wb_candidate1
4417 || regno == cfun->machine->frame.wb_candidate2))
4418 continue;
4419
4420 reg = gen_rtx_REG (mode, regno);
4421 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4422 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4423
4424 regno2 = aarch64_next_callee_save (regno + 1, limit);
4425
4426 if (regno2 <= limit
4427 && !cfun->machine->reg_is_wrapped_separately[regno2]
4428 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4429 == cfun->machine->frame.reg_offset[regno2]))
4430 {
4431 rtx reg2 = gen_rtx_REG (mode, regno2);
4432 rtx mem2;
4433
4434 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4435 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4436 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4437
4438 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4439 regno = regno2;
4440 }
4441 else
4442 emit_move_insn (reg, mem);
4443 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4444 }
4445 }
4446
4447 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4448 of MODE. */
4449
4450 static inline bool
4451 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4452 {
4453 HOST_WIDE_INT multiple;
4454 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4455 && IN_RANGE (multiple, -8, 7));
4456 }
4457
4458 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
4459 of MODE. */
4460
4461 static inline bool
4462 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4463 {
4464 HOST_WIDE_INT multiple;
4465 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4466 && IN_RANGE (multiple, 0, 63));
4467 }
4468
4469 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4470 of MODE. */
4471
4472 bool
4473 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4474 {
4475 HOST_WIDE_INT multiple;
4476 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4477 && IN_RANGE (multiple, -64, 63));
4478 }
4479
4480 /* Return true if OFFSET is a signed 9-bit value. */
4481
4482 static inline bool
4483 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4484 poly_int64 offset)
4485 {
4486 HOST_WIDE_INT const_offset;
4487 return (offset.is_constant (&const_offset)
4488 && IN_RANGE (const_offset, -256, 255));
4489 }
4490
4491 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4492 of MODE. */
4493
4494 static inline bool
4495 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4496 {
4497 HOST_WIDE_INT multiple;
4498 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4499 && IN_RANGE (multiple, -256, 255));
4500 }
4501
4502 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4503 of MODE. */
4504
4505 static inline bool
4506 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4507 {
4508 HOST_WIDE_INT multiple;
4509 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4510 && IN_RANGE (multiple, 0, 4095));
4511 }
4512
4513 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4514
4515 static sbitmap
4516 aarch64_get_separate_components (void)
4517 {
4518 aarch64_layout_frame ();
4519
4520 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4521 bitmap_clear (components);
4522
4523 /* The registers we need saved to the frame. */
4524 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4525 if (aarch64_register_saved_on_entry (regno))
4526 {
4527 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4528 if (!frame_pointer_needed)
4529 offset += cfun->machine->frame.frame_size
4530 - cfun->machine->frame.hard_fp_offset;
4531 /* Check that we can access the stack slot of the register with one
4532 direct load with no adjustments needed. */
4533 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4534 bitmap_set_bit (components, regno);
4535 }
4536
4537 /* Don't mess with the hard frame pointer. */
4538 if (frame_pointer_needed)
4539 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4540
4541 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4542 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4543 /* If aarch64_layout_frame has chosen registers to store/restore with
4544 writeback don't interfere with them to avoid having to output explicit
4545 stack adjustment instructions. */
4546 if (reg2 != INVALID_REGNUM)
4547 bitmap_clear_bit (components, reg2);
4548 if (reg1 != INVALID_REGNUM)
4549 bitmap_clear_bit (components, reg1);
4550
4551 bitmap_clear_bit (components, LR_REGNUM);
4552 bitmap_clear_bit (components, SP_REGNUM);
4553
4554 return components;
4555 }
4556
4557 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4558
4559 static sbitmap
4560 aarch64_components_for_bb (basic_block bb)
4561 {
4562 bitmap in = DF_LIVE_IN (bb);
4563 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4564 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4565
4566 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4567 bitmap_clear (components);
4568
4569 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4570 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4571 if ((!call_used_regs[regno])
4572 && (bitmap_bit_p (in, regno)
4573 || bitmap_bit_p (gen, regno)
4574 || bitmap_bit_p (kill, regno)))
4575 {
4576 unsigned regno2, offset, offset2;
4577 bitmap_set_bit (components, regno);
4578
4579 /* If there is a callee-save at an adjacent offset, add it too
4580 to increase the use of LDP/STP. */
4581 offset = cfun->machine->frame.reg_offset[regno];
4582 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4583
4584 if (regno2 <= LAST_SAVED_REGNUM)
4585 {
4586 offset2 = cfun->machine->frame.reg_offset[regno2];
4587 if ((offset & ~8) == (offset2 & ~8))
4588 bitmap_set_bit (components, regno2);
4589 }
4590 }
4591
4592 return components;
4593 }
4594
4595 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4596 Nothing to do for aarch64. */
4597
4598 static void
4599 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4600 {
4601 }
4602
4603 /* Return the next set bit in BMP from START onwards. Return the total number
4604 of bits in BMP if no set bit is found at or after START. */
4605
4606 static unsigned int
4607 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4608 {
4609 unsigned int nbits = SBITMAP_SIZE (bmp);
4610 if (start == nbits)
4611 return start;
4612
4613 gcc_assert (start < nbits);
4614 for (unsigned int i = start; i < nbits; i++)
4615 if (bitmap_bit_p (bmp, i))
4616 return i;
4617
4618 return nbits;
4619 }
4620
4621 /* Do the work for aarch64_emit_prologue_components and
4622 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4623 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4624 for these components or the epilogue sequence. That is, it determines
4625 whether we should emit stores or loads and what kind of CFA notes to attach
4626 to the insns. Otherwise the logic for the two sequences is very
4627 similar. */
4628
4629 static void
4630 aarch64_process_components (sbitmap components, bool prologue_p)
4631 {
4632 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4633 ? HARD_FRAME_POINTER_REGNUM
4634 : STACK_POINTER_REGNUM);
4635
4636 unsigned last_regno = SBITMAP_SIZE (components);
4637 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4638 rtx_insn *insn = NULL;
4639
4640 while (regno != last_regno)
4641 {
4642 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4643 so DFmode for the vector registers is enough. */
4644 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4645 rtx reg = gen_rtx_REG (mode, regno);
4646 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4647 if (!frame_pointer_needed)
4648 offset += cfun->machine->frame.frame_size
4649 - cfun->machine->frame.hard_fp_offset;
4650 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4651 rtx mem = gen_frame_mem (mode, addr);
4652
4653 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4654 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4655 /* No more registers to handle after REGNO.
4656 Emit a single save/restore and exit. */
4657 if (regno2 == last_regno)
4658 {
4659 insn = emit_insn (set);
4660 RTX_FRAME_RELATED_P (insn) = 1;
4661 if (prologue_p)
4662 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4663 else
4664 add_reg_note (insn, REG_CFA_RESTORE, reg);
4665 break;
4666 }
4667
4668 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4669 /* The next register is not of the same class or its offset is not
4670 mergeable with the current one into a pair. */
4671 if (!satisfies_constraint_Ump (mem)
4672 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4673 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4674 GET_MODE_SIZE (mode)))
4675 {
4676 insn = emit_insn (set);
4677 RTX_FRAME_RELATED_P (insn) = 1;
4678 if (prologue_p)
4679 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4680 else
4681 add_reg_note (insn, REG_CFA_RESTORE, reg);
4682
4683 regno = regno2;
4684 continue;
4685 }
4686
4687 /* REGNO2 can be saved/restored in a pair with REGNO. */
4688 rtx reg2 = gen_rtx_REG (mode, regno2);
4689 if (!frame_pointer_needed)
4690 offset2 += cfun->machine->frame.frame_size
4691 - cfun->machine->frame.hard_fp_offset;
4692 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4693 rtx mem2 = gen_frame_mem (mode, addr2);
4694 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4695 : gen_rtx_SET (reg2, mem2);
4696
4697 if (prologue_p)
4698 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4699 else
4700 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4701
4702 RTX_FRAME_RELATED_P (insn) = 1;
4703 if (prologue_p)
4704 {
4705 add_reg_note (insn, REG_CFA_OFFSET, set);
4706 add_reg_note (insn, REG_CFA_OFFSET, set2);
4707 }
4708 else
4709 {
4710 add_reg_note (insn, REG_CFA_RESTORE, reg);
4711 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4712 }
4713
4714 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4715 }
4716 }
4717
4718 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4719
4720 static void
4721 aarch64_emit_prologue_components (sbitmap components)
4722 {
4723 aarch64_process_components (components, true);
4724 }
4725
4726 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4727
4728 static void
4729 aarch64_emit_epilogue_components (sbitmap components)
4730 {
4731 aarch64_process_components (components, false);
4732 }
4733
4734 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4735
4736 static void
4737 aarch64_set_handled_components (sbitmap components)
4738 {
4739 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4740 if (bitmap_bit_p (components, regno))
4741 cfun->machine->reg_is_wrapped_separately[regno] = true;
4742 }
4743
4744 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4745 is saved at BASE + OFFSET. */
4746
4747 static void
4748 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4749 rtx base, poly_int64 offset)
4750 {
4751 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4752 add_reg_note (insn, REG_CFA_EXPRESSION,
4753 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4754 }
4755
4756 /* AArch64 stack frames generated by this compiler look like:
4757
4758 +-------------------------------+
4759 | |
4760 | incoming stack arguments |
4761 | |
4762 +-------------------------------+
4763 | | <-- incoming stack pointer (aligned)
4764 | callee-allocated save area |
4765 | for register varargs |
4766 | |
4767 +-------------------------------+
4768 | local variables | <-- frame_pointer_rtx
4769 | |
4770 +-------------------------------+
4771 | padding0 | \
4772 +-------------------------------+ |
4773 | callee-saved registers | | frame.saved_regs_size
4774 +-------------------------------+ |
4775 | LR' | |
4776 +-------------------------------+ |
4777 | FP' | / <- hard_frame_pointer_rtx (aligned)
4778 +-------------------------------+
4779 | dynamic allocation |
4780 +-------------------------------+
4781 | padding |
4782 +-------------------------------+
4783 | outgoing stack arguments | <-- arg_pointer
4784 | |
4785 +-------------------------------+
4786 | | <-- stack_pointer_rtx (aligned)
4787
4788 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4789 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4790 unchanged. */
4791
4792 /* Generate the prologue instructions for entry into a function.
4793 Establish the stack frame by decreasing the stack pointer with a
4794 properly calculated size and, if necessary, create a frame record
4795 filled with the values of LR and previous frame pointer. The
4796 current FP is also set up if it is in use. */
4797
4798 void
4799 aarch64_expand_prologue (void)
4800 {
4801 aarch64_layout_frame ();
4802
4803 poly_int64 frame_size = cfun->machine->frame.frame_size;
4804 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4805 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4806 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4807 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4808 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4809 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4810 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4811 rtx_insn *insn;
4812
4813 /* Sign return address for functions. */
4814 if (aarch64_return_address_signing_enabled ())
4815 {
4816 insn = emit_insn (gen_pacisp ());
4817 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4818 RTX_FRAME_RELATED_P (insn) = 1;
4819 }
4820
4821 if (flag_stack_usage_info)
4822 current_function_static_stack_size = constant_lower_bound (frame_size);
4823
4824 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4825 {
4826 if (crtl->is_leaf && !cfun->calls_alloca)
4827 {
4828 if (maybe_gt (frame_size, PROBE_INTERVAL)
4829 && maybe_gt (frame_size, get_stack_check_protect ()))
4830 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4831 (frame_size
4832 - get_stack_check_protect ()));
4833 }
4834 else if (maybe_gt (frame_size, 0))
4835 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4836 }
4837
4838 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4839 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4840
4841 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4842
4843 if (callee_adjust != 0)
4844 aarch64_push_regs (reg1, reg2, callee_adjust);
4845
4846 if (emit_frame_chain)
4847 {
4848 poly_int64 reg_offset = callee_adjust;
4849 if (callee_adjust == 0)
4850 {
4851 reg1 = R29_REGNUM;
4852 reg2 = R30_REGNUM;
4853 reg_offset = callee_offset;
4854 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4855 }
4856 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4857 stack_pointer_rtx, callee_offset,
4858 ip1_rtx, ip0_rtx, frame_pointer_needed);
4859 if (frame_pointer_needed && !frame_size.is_constant ())
4860 {
4861 /* Variable-sized frames need to describe the save slot
4862 address using DW_CFA_expression rather than DW_CFA_offset.
4863 This means that, without taking further action, the
4864 locations of the registers that we've already saved would
4865 remain based on the stack pointer even after we redefine
4866 the CFA based on the frame pointer. We therefore need new
4867 DW_CFA_expressions to re-express the save slots with addresses
4868 based on the frame pointer. */
4869 rtx_insn *insn = get_last_insn ();
4870 gcc_assert (RTX_FRAME_RELATED_P (insn));
4871
4872 /* Add an explicit CFA definition if this was previously
4873 implicit. */
4874 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4875 {
4876 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4877 callee_offset);
4878 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4879 gen_rtx_SET (hard_frame_pointer_rtx, src));
4880 }
4881
4882 /* Change the save slot expressions for the registers that
4883 we've already saved. */
4884 reg_offset -= callee_offset;
4885 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4886 reg_offset + UNITS_PER_WORD);
4887 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4888 reg_offset);
4889 }
4890 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4891 }
4892
4893 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4894 callee_adjust != 0 || emit_frame_chain);
4895 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4896 callee_adjust != 0 || emit_frame_chain);
4897 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4898 }
4899
4900 /* Return TRUE if we can use a simple_return insn.
4901
4902 This function checks whether the callee saved stack is empty, which
4903 means no restore actions are need. The pro_and_epilogue will use
4904 this to check whether shrink-wrapping opt is feasible. */
4905
4906 bool
4907 aarch64_use_return_insn_p (void)
4908 {
4909 if (!reload_completed)
4910 return false;
4911
4912 if (crtl->profile)
4913 return false;
4914
4915 aarch64_layout_frame ();
4916
4917 return known_eq (cfun->machine->frame.frame_size, 0);
4918 }
4919
4920 /* Generate the epilogue instructions for returning from a function.
4921 This is almost exactly the reverse of the prolog sequence, except
4922 that we need to insert barriers to avoid scheduling loads that read
4923 from a deallocated stack, and we optimize the unwind records by
4924 emitting them all together if possible. */
4925 void
4926 aarch64_expand_epilogue (bool for_sibcall)
4927 {
4928 aarch64_layout_frame ();
4929
4930 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4931 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4932 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4933 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4934 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4935 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4936 rtx cfi_ops = NULL;
4937 rtx_insn *insn;
4938 /* A stack clash protection prologue may not have left IP0_REGNUM or
4939 IP1_REGNUM in a usable state. The same is true for allocations
4940 with an SVE component, since we then need both temporary registers
4941 for each allocation. */
4942 bool can_inherit_p = (initial_adjust.is_constant ()
4943 && final_adjust.is_constant ()
4944 && !flag_stack_clash_protection);
4945
4946 /* We need to add memory barrier to prevent read from deallocated stack. */
4947 bool need_barrier_p
4948 = maybe_ne (get_frame_size ()
4949 + cfun->machine->frame.saved_varargs_size, 0);
4950
4951 /* Emit a barrier to prevent loads from a deallocated stack. */
4952 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4953 || cfun->calls_alloca
4954 || crtl->calls_eh_return)
4955 {
4956 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4957 need_barrier_p = false;
4958 }
4959
4960 /* Restore the stack pointer from the frame pointer if it may not
4961 be the same as the stack pointer. */
4962 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4963 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4964 if (frame_pointer_needed
4965 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4966 /* If writeback is used when restoring callee-saves, the CFA
4967 is restored on the instruction doing the writeback. */
4968 aarch64_add_offset (Pmode, stack_pointer_rtx,
4969 hard_frame_pointer_rtx, -callee_offset,
4970 ip1_rtx, ip0_rtx, callee_adjust == 0);
4971 else
4972 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4973 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4974
4975 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4976 callee_adjust != 0, &cfi_ops);
4977 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4978 callee_adjust != 0, &cfi_ops);
4979
4980 if (need_barrier_p)
4981 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4982
4983 if (callee_adjust != 0)
4984 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4985
4986 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4987 {
4988 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4989 insn = get_last_insn ();
4990 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4991 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4992 RTX_FRAME_RELATED_P (insn) = 1;
4993 cfi_ops = NULL;
4994 }
4995
4996 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4997 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4998
4999 if (cfi_ops)
5000 {
5001 /* Emit delayed restores and reset the CFA to be SP. */
5002 insn = get_last_insn ();
5003 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5004 REG_NOTES (insn) = cfi_ops;
5005 RTX_FRAME_RELATED_P (insn) = 1;
5006 }
5007
5008 /* We prefer to emit the combined return/authenticate instruction RETAA,
5009 however there are three cases in which we must instead emit an explicit
5010 authentication instruction.
5011
5012 1) Sibcalls don't return in a normal way, so if we're about to call one
5013 we must authenticate.
5014
5015 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5016 generating code for !TARGET_ARMV8_3 we can't use it and must
5017 explicitly authenticate.
5018
5019 3) On an eh_return path we make extra stack adjustments to update the
5020 canonical frame address to be the exception handler's CFA. We want
5021 to authenticate using the CFA of the function which calls eh_return.
5022 */
5023 if (aarch64_return_address_signing_enabled ()
5024 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5025 {
5026 insn = emit_insn (gen_autisp ());
5027 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5028 RTX_FRAME_RELATED_P (insn) = 1;
5029 }
5030
5031 /* Stack adjustment for exception handler. */
5032 if (crtl->calls_eh_return)
5033 {
5034 /* We need to unwind the stack by the offset computed by
5035 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5036 to be SP; letting the CFA move during this adjustment
5037 is just as correct as retaining the CFA from the body
5038 of the function. Therefore, do nothing special. */
5039 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5040 }
5041
5042 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5043 if (!for_sibcall)
5044 emit_jump_insn (ret_rtx);
5045 }
5046
5047 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5048 normally or return to a previous frame after unwinding.
5049
5050 An EH return uses a single shared return sequence. The epilogue is
5051 exactly like a normal epilogue except that it has an extra input
5052 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5053 that must be applied after the frame has been destroyed. An extra label
5054 is inserted before the epilogue which initializes this register to zero,
5055 and this is the entry point for a normal return.
5056
5057 An actual EH return updates the return address, initializes the stack
5058 adjustment and jumps directly into the epilogue (bypassing the zeroing
5059 of the adjustment). Since the return address is typically saved on the
5060 stack when a function makes a call, the saved LR must be updated outside
5061 the epilogue.
5062
5063 This poses problems as the store is generated well before the epilogue,
5064 so the offset of LR is not known yet. Also optimizations will remove the
5065 store as it appears dead, even after the epilogue is generated (as the
5066 base or offset for loading LR is different in many cases).
5067
5068 To avoid these problems this implementation forces the frame pointer
5069 in eh_return functions so that the location of LR is fixed and known early.
5070 It also marks the store volatile, so no optimization is permitted to
5071 remove the store. */
5072 rtx
5073 aarch64_eh_return_handler_rtx (void)
5074 {
5075 rtx tmp = gen_frame_mem (Pmode,
5076 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5077
5078 /* Mark the store volatile, so no optimization is permitted to remove it. */
5079 MEM_VOLATILE_P (tmp) = true;
5080 return tmp;
5081 }
5082
5083 /* Output code to add DELTA to the first argument, and then jump
5084 to FUNCTION. Used for C++ multiple inheritance. */
5085 static void
5086 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5087 HOST_WIDE_INT delta,
5088 HOST_WIDE_INT vcall_offset,
5089 tree function)
5090 {
5091 /* The this pointer is always in x0. Note that this differs from
5092 Arm where the this pointer maybe bumped to r1 if r0 is required
5093 to return a pointer to an aggregate. On AArch64 a result value
5094 pointer will be in x8. */
5095 int this_regno = R0_REGNUM;
5096 rtx this_rtx, temp0, temp1, addr, funexp;
5097 rtx_insn *insn;
5098
5099 reload_completed = 1;
5100 emit_note (NOTE_INSN_PROLOGUE_END);
5101
5102 this_rtx = gen_rtx_REG (Pmode, this_regno);
5103 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5104 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5105
5106 if (vcall_offset == 0)
5107 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5108 else
5109 {
5110 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5111
5112 addr = this_rtx;
5113 if (delta != 0)
5114 {
5115 if (delta >= -256 && delta < 256)
5116 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5117 plus_constant (Pmode, this_rtx, delta));
5118 else
5119 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5120 temp1, temp0, false);
5121 }
5122
5123 if (Pmode == ptr_mode)
5124 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5125 else
5126 aarch64_emit_move (temp0,
5127 gen_rtx_ZERO_EXTEND (Pmode,
5128 gen_rtx_MEM (ptr_mode, addr)));
5129
5130 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5131 addr = plus_constant (Pmode, temp0, vcall_offset);
5132 else
5133 {
5134 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5135 Pmode);
5136 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5137 }
5138
5139 if (Pmode == ptr_mode)
5140 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5141 else
5142 aarch64_emit_move (temp1,
5143 gen_rtx_SIGN_EXTEND (Pmode,
5144 gen_rtx_MEM (ptr_mode, addr)));
5145
5146 emit_insn (gen_add2_insn (this_rtx, temp1));
5147 }
5148
5149 /* Generate a tail call to the target function. */
5150 if (!TREE_USED (function))
5151 {
5152 assemble_external (function);
5153 TREE_USED (function) = 1;
5154 }
5155 funexp = XEXP (DECL_RTL (function), 0);
5156 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5157 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5158 SIBLING_CALL_P (insn) = 1;
5159
5160 insn = get_insns ();
5161 shorten_branches (insn);
5162 final_start_function (insn, file, 1);
5163 final (insn, file, 1);
5164 final_end_function ();
5165
5166 /* Stop pretending to be a post-reload pass. */
5167 reload_completed = 0;
5168 }
5169
5170 static bool
5171 aarch64_tls_referenced_p (rtx x)
5172 {
5173 if (!TARGET_HAVE_TLS)
5174 return false;
5175 subrtx_iterator::array_type array;
5176 FOR_EACH_SUBRTX (iter, array, x, ALL)
5177 {
5178 const_rtx x = *iter;
5179 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5180 return true;
5181 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5182 TLS offsets, not real symbol references. */
5183 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5184 iter.skip_subrtxes ();
5185 }
5186 return false;
5187 }
5188
5189
5190 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5191 a left shift of 0 or 12 bits. */
5192 bool
5193 aarch64_uimm12_shift (HOST_WIDE_INT val)
5194 {
5195 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5196 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5197 );
5198 }
5199
5200
5201 /* Return true if val is an immediate that can be loaded into a
5202 register by a MOVZ instruction. */
5203 static bool
5204 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5205 {
5206 if (GET_MODE_SIZE (mode) > 4)
5207 {
5208 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5209 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5210 return 1;
5211 }
5212 else
5213 {
5214 /* Ignore sign extension. */
5215 val &= (HOST_WIDE_INT) 0xffffffff;
5216 }
5217 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5218 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5219 }
5220
5221 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5222 64-bit (DImode) integer. */
5223
5224 static unsigned HOST_WIDE_INT
5225 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5226 {
5227 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5228 while (size < 64)
5229 {
5230 val &= (HOST_WIDE_INT_1U << size) - 1;
5231 val |= val << size;
5232 size *= 2;
5233 }
5234 return val;
5235 }
5236
5237 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5238
5239 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5240 {
5241 0x0000000100000001ull,
5242 0x0001000100010001ull,
5243 0x0101010101010101ull,
5244 0x1111111111111111ull,
5245 0x5555555555555555ull,
5246 };
5247
5248
5249 /* Return true if val is a valid bitmask immediate. */
5250
5251 bool
5252 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5253 {
5254 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5255 int bits;
5256
5257 /* Check for a single sequence of one bits and return quickly if so.
5258 The special cases of all ones and all zeroes returns false. */
5259 val = aarch64_replicate_bitmask_imm (val_in, mode);
5260 tmp = val + (val & -val);
5261
5262 if (tmp == (tmp & -tmp))
5263 return (val + 1) > 1;
5264
5265 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5266 if (mode == SImode)
5267 val = (val << 32) | (val & 0xffffffff);
5268
5269 /* Invert if the immediate doesn't start with a zero bit - this means we
5270 only need to search for sequences of one bits. */
5271 if (val & 1)
5272 val = ~val;
5273
5274 /* Find the first set bit and set tmp to val with the first sequence of one
5275 bits removed. Return success if there is a single sequence of ones. */
5276 first_one = val & -val;
5277 tmp = val & (val + first_one);
5278
5279 if (tmp == 0)
5280 return true;
5281
5282 /* Find the next set bit and compute the difference in bit position. */
5283 next_one = tmp & -tmp;
5284 bits = clz_hwi (first_one) - clz_hwi (next_one);
5285 mask = val ^ tmp;
5286
5287 /* Check the bit position difference is a power of 2, and that the first
5288 sequence of one bits fits within 'bits' bits. */
5289 if ((mask >> bits) != 0 || bits != (bits & -bits))
5290 return false;
5291
5292 /* Check the sequence of one bits is repeated 64/bits times. */
5293 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5294 }
5295
5296 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5297 Assumed precondition: VAL_IN Is not zero. */
5298
5299 unsigned HOST_WIDE_INT
5300 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5301 {
5302 int lowest_bit_set = ctz_hwi (val_in);
5303 int highest_bit_set = floor_log2 (val_in);
5304 gcc_assert (val_in != 0);
5305
5306 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5307 (HOST_WIDE_INT_1U << lowest_bit_set));
5308 }
5309
5310 /* Create constant where bits outside of lowest bit set to highest bit set
5311 are set to 1. */
5312
5313 unsigned HOST_WIDE_INT
5314 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5315 {
5316 return val_in | ~aarch64_and_split_imm1 (val_in);
5317 }
5318
5319 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5320
5321 bool
5322 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5323 {
5324 scalar_int_mode int_mode;
5325 if (!is_a <scalar_int_mode> (mode, &int_mode))
5326 return false;
5327
5328 if (aarch64_bitmask_imm (val_in, int_mode))
5329 return false;
5330
5331 if (aarch64_move_imm (val_in, int_mode))
5332 return false;
5333
5334 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5335
5336 return aarch64_bitmask_imm (imm2, int_mode);
5337 }
5338
5339 /* Return true if val is an immediate that can be loaded into a
5340 register in a single instruction. */
5341 bool
5342 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5343 {
5344 scalar_int_mode int_mode;
5345 if (!is_a <scalar_int_mode> (mode, &int_mode))
5346 return false;
5347
5348 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5349 return 1;
5350 return aarch64_bitmask_imm (val, int_mode);
5351 }
5352
5353 static bool
5354 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5355 {
5356 rtx base, offset;
5357
5358 if (GET_CODE (x) == HIGH)
5359 return true;
5360
5361 /* There's no way to calculate VL-based values using relocations. */
5362 subrtx_iterator::array_type array;
5363 FOR_EACH_SUBRTX (iter, array, x, ALL)
5364 if (GET_CODE (*iter) == CONST_POLY_INT)
5365 return true;
5366
5367 split_const (x, &base, &offset);
5368 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5369 {
5370 if (aarch64_classify_symbol (base, INTVAL (offset))
5371 != SYMBOL_FORCE_TO_MEM)
5372 return true;
5373 else
5374 /* Avoid generating a 64-bit relocation in ILP32; leave
5375 to aarch64_expand_mov_immediate to handle it properly. */
5376 return mode != ptr_mode;
5377 }
5378
5379 return aarch64_tls_referenced_p (x);
5380 }
5381
5382 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5383 The expansion for a table switch is quite expensive due to the number
5384 of instructions, the table lookup and hard to predict indirect jump.
5385 When optimizing for speed, and -O3 enabled, use the per-core tuning if
5386 set, otherwise use tables for > 16 cases as a tradeoff between size and
5387 performance. When optimizing for size, use the default setting. */
5388
5389 static unsigned int
5390 aarch64_case_values_threshold (void)
5391 {
5392 /* Use the specified limit for the number of cases before using jump
5393 tables at higher optimization levels. */
5394 if (optimize > 2
5395 && selected_cpu->tune->max_case_values != 0)
5396 return selected_cpu->tune->max_case_values;
5397 else
5398 return optimize_size ? default_case_values_threshold () : 17;
5399 }
5400
5401 /* Return true if register REGNO is a valid index register.
5402 STRICT_P is true if REG_OK_STRICT is in effect. */
5403
5404 bool
5405 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5406 {
5407 if (!HARD_REGISTER_NUM_P (regno))
5408 {
5409 if (!strict_p)
5410 return true;
5411
5412 if (!reg_renumber)
5413 return false;
5414
5415 regno = reg_renumber[regno];
5416 }
5417 return GP_REGNUM_P (regno);
5418 }
5419
5420 /* Return true if register REGNO is a valid base register for mode MODE.
5421 STRICT_P is true if REG_OK_STRICT is in effect. */
5422
5423 bool
5424 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5425 {
5426 if (!HARD_REGISTER_NUM_P (regno))
5427 {
5428 if (!strict_p)
5429 return true;
5430
5431 if (!reg_renumber)
5432 return false;
5433
5434 regno = reg_renumber[regno];
5435 }
5436
5437 /* The fake registers will be eliminated to either the stack or
5438 hard frame pointer, both of which are usually valid base registers.
5439 Reload deals with the cases where the eliminated form isn't valid. */
5440 return (GP_REGNUM_P (regno)
5441 || regno == SP_REGNUM
5442 || regno == FRAME_POINTER_REGNUM
5443 || regno == ARG_POINTER_REGNUM);
5444 }
5445
5446 /* Return true if X is a valid base register for mode MODE.
5447 STRICT_P is true if REG_OK_STRICT is in effect. */
5448
5449 static bool
5450 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5451 {
5452 if (!strict_p
5453 && GET_CODE (x) == SUBREG
5454 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5455 x = SUBREG_REG (x);
5456
5457 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5458 }
5459
5460 /* Return true if address offset is a valid index. If it is, fill in INFO
5461 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5462
5463 static bool
5464 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5465 machine_mode mode, bool strict_p)
5466 {
5467 enum aarch64_address_type type;
5468 rtx index;
5469 int shift;
5470
5471 /* (reg:P) */
5472 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5473 && GET_MODE (x) == Pmode)
5474 {
5475 type = ADDRESS_REG_REG;
5476 index = x;
5477 shift = 0;
5478 }
5479 /* (sign_extend:DI (reg:SI)) */
5480 else if ((GET_CODE (x) == SIGN_EXTEND
5481 || GET_CODE (x) == ZERO_EXTEND)
5482 && GET_MODE (x) == DImode
5483 && GET_MODE (XEXP (x, 0)) == SImode)
5484 {
5485 type = (GET_CODE (x) == SIGN_EXTEND)
5486 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5487 index = XEXP (x, 0);
5488 shift = 0;
5489 }
5490 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5491 else if (GET_CODE (x) == MULT
5492 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5493 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5494 && GET_MODE (XEXP (x, 0)) == DImode
5495 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5496 && CONST_INT_P (XEXP (x, 1)))
5497 {
5498 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5499 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5500 index = XEXP (XEXP (x, 0), 0);
5501 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5502 }
5503 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5504 else if (GET_CODE (x) == ASHIFT
5505 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5506 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5507 && GET_MODE (XEXP (x, 0)) == DImode
5508 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5509 && CONST_INT_P (XEXP (x, 1)))
5510 {
5511 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5512 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5513 index = XEXP (XEXP (x, 0), 0);
5514 shift = INTVAL (XEXP (x, 1));
5515 }
5516 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5517 else if ((GET_CODE (x) == SIGN_EXTRACT
5518 || GET_CODE (x) == ZERO_EXTRACT)
5519 && GET_MODE (x) == DImode
5520 && GET_CODE (XEXP (x, 0)) == MULT
5521 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5522 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5523 {
5524 type = (GET_CODE (x) == SIGN_EXTRACT)
5525 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5526 index = XEXP (XEXP (x, 0), 0);
5527 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5528 if (INTVAL (XEXP (x, 1)) != 32 + shift
5529 || INTVAL (XEXP (x, 2)) != 0)
5530 shift = -1;
5531 }
5532 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5533 (const_int 0xffffffff<<shift)) */
5534 else if (GET_CODE (x) == AND
5535 && GET_MODE (x) == DImode
5536 && GET_CODE (XEXP (x, 0)) == MULT
5537 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5538 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5539 && CONST_INT_P (XEXP (x, 1)))
5540 {
5541 type = ADDRESS_REG_UXTW;
5542 index = XEXP (XEXP (x, 0), 0);
5543 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5544 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5545 shift = -1;
5546 }
5547 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5548 else if ((GET_CODE (x) == SIGN_EXTRACT
5549 || GET_CODE (x) == ZERO_EXTRACT)
5550 && GET_MODE (x) == DImode
5551 && GET_CODE (XEXP (x, 0)) == ASHIFT
5552 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5553 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5554 {
5555 type = (GET_CODE (x) == SIGN_EXTRACT)
5556 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5557 index = XEXP (XEXP (x, 0), 0);
5558 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5559 if (INTVAL (XEXP (x, 1)) != 32 + shift
5560 || INTVAL (XEXP (x, 2)) != 0)
5561 shift = -1;
5562 }
5563 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5564 (const_int 0xffffffff<<shift)) */
5565 else if (GET_CODE (x) == AND
5566 && GET_MODE (x) == DImode
5567 && GET_CODE (XEXP (x, 0)) == ASHIFT
5568 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5569 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5570 && CONST_INT_P (XEXP (x, 1)))
5571 {
5572 type = ADDRESS_REG_UXTW;
5573 index = XEXP (XEXP (x, 0), 0);
5574 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5575 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5576 shift = -1;
5577 }
5578 /* (mult:P (reg:P) (const_int scale)) */
5579 else if (GET_CODE (x) == MULT
5580 && GET_MODE (x) == Pmode
5581 && GET_MODE (XEXP (x, 0)) == Pmode
5582 && CONST_INT_P (XEXP (x, 1)))
5583 {
5584 type = ADDRESS_REG_REG;
5585 index = XEXP (x, 0);
5586 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5587 }
5588 /* (ashift:P (reg:P) (const_int shift)) */
5589 else if (GET_CODE (x) == ASHIFT
5590 && GET_MODE (x) == Pmode
5591 && GET_MODE (XEXP (x, 0)) == Pmode
5592 && CONST_INT_P (XEXP (x, 1)))
5593 {
5594 type = ADDRESS_REG_REG;
5595 index = XEXP (x, 0);
5596 shift = INTVAL (XEXP (x, 1));
5597 }
5598 else
5599 return false;
5600
5601 if (!strict_p
5602 && GET_CODE (index) == SUBREG
5603 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5604 index = SUBREG_REG (index);
5605
5606 if (aarch64_sve_data_mode_p (mode))
5607 {
5608 if (type != ADDRESS_REG_REG
5609 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5610 return false;
5611 }
5612 else
5613 {
5614 if (shift != 0
5615 && !(IN_RANGE (shift, 1, 3)
5616 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5617 return false;
5618 }
5619
5620 if (REG_P (index)
5621 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5622 {
5623 info->type = type;
5624 info->offset = index;
5625 info->shift = shift;
5626 return true;
5627 }
5628
5629 return false;
5630 }
5631
5632 /* Return true if MODE is one of the modes for which we
5633 support LDP/STP operations. */
5634
5635 static bool
5636 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5637 {
5638 return mode == SImode || mode == DImode
5639 || mode == SFmode || mode == DFmode
5640 || (aarch64_vector_mode_supported_p (mode)
5641 && known_eq (GET_MODE_SIZE (mode), 8));
5642 }
5643
5644 /* Return true if REGNO is a virtual pointer register, or an eliminable
5645 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5646 include stack_pointer or hard_frame_pointer. */
5647 static bool
5648 virt_or_elim_regno_p (unsigned regno)
5649 {
5650 return ((regno >= FIRST_VIRTUAL_REGISTER
5651 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5652 || regno == FRAME_POINTER_REGNUM
5653 || regno == ARG_POINTER_REGNUM);
5654 }
5655
5656 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5657 If it is, fill in INFO appropriately. STRICT_P is true if
5658 REG_OK_STRICT is in effect. */
5659
5660 static bool
5661 aarch64_classify_address (struct aarch64_address_info *info,
5662 rtx x, machine_mode mode, bool strict_p,
5663 aarch64_addr_query_type type = ADDR_QUERY_M)
5664 {
5665 enum rtx_code code = GET_CODE (x);
5666 rtx op0, op1;
5667 poly_int64 offset;
5668
5669 HOST_WIDE_INT const_size;
5670
5671 /* On BE, we use load/store pair for all large int mode load/stores.
5672 TI/TFmode may also use a load/store pair. */
5673 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5674 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5675 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5676 || mode == TImode
5677 || mode == TFmode
5678 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5679
5680 bool allow_reg_index_p = (!load_store_pair_p
5681 && (known_lt (GET_MODE_SIZE (mode), 16)
5682 || vec_flags == VEC_ADVSIMD
5683 || vec_flags == VEC_SVE_DATA));
5684
5685 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5686 [Rn, #offset, MUL VL]. */
5687 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5688 && (code != REG && code != PLUS))
5689 return false;
5690
5691 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5692 REG addressing. */
5693 if (advsimd_struct_p
5694 && !BYTES_BIG_ENDIAN
5695 && (code != POST_INC && code != REG))
5696 return false;
5697
5698 gcc_checking_assert (GET_MODE (x) == VOIDmode
5699 || SCALAR_INT_MODE_P (GET_MODE (x)));
5700
5701 switch (code)
5702 {
5703 case REG:
5704 case SUBREG:
5705 info->type = ADDRESS_REG_IMM;
5706 info->base = x;
5707 info->offset = const0_rtx;
5708 info->const_offset = 0;
5709 return aarch64_base_register_rtx_p (x, strict_p);
5710
5711 case PLUS:
5712 op0 = XEXP (x, 0);
5713 op1 = XEXP (x, 1);
5714
5715 if (! strict_p
5716 && REG_P (op0)
5717 && virt_or_elim_regno_p (REGNO (op0))
5718 && poly_int_rtx_p (op1, &offset))
5719 {
5720 info->type = ADDRESS_REG_IMM;
5721 info->base = op0;
5722 info->offset = op1;
5723 info->const_offset = offset;
5724
5725 return true;
5726 }
5727
5728 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5729 && aarch64_base_register_rtx_p (op0, strict_p)
5730 && poly_int_rtx_p (op1, &offset))
5731 {
5732 info->type = ADDRESS_REG_IMM;
5733 info->base = op0;
5734 info->offset = op1;
5735 info->const_offset = offset;
5736
5737 /* TImode and TFmode values are allowed in both pairs of X
5738 registers and individual Q registers. The available
5739 address modes are:
5740 X,X: 7-bit signed scaled offset
5741 Q: 9-bit signed offset
5742 We conservatively require an offset representable in either mode.
5743 When performing the check for pairs of X registers i.e. LDP/STP
5744 pass down DImode since that is the natural size of the LDP/STP
5745 instruction memory accesses. */
5746 if (mode == TImode || mode == TFmode)
5747 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5748 && (offset_9bit_signed_unscaled_p (mode, offset)
5749 || offset_12bit_unsigned_scaled_p (mode, offset)));
5750
5751 /* A 7bit offset check because OImode will emit a ldp/stp
5752 instruction (only big endian will get here).
5753 For ldp/stp instructions, the offset is scaled for the size of a
5754 single element of the pair. */
5755 if (mode == OImode)
5756 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5757
5758 /* Three 9/12 bit offsets checks because CImode will emit three
5759 ldr/str instructions (only big endian will get here). */
5760 if (mode == CImode)
5761 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5762 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5763 || offset_12bit_unsigned_scaled_p (V16QImode,
5764 offset + 32)));
5765
5766 /* Two 7bit offsets checks because XImode will emit two ldp/stp
5767 instructions (only big endian will get here). */
5768 if (mode == XImode)
5769 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5770 && aarch64_offset_7bit_signed_scaled_p (TImode,
5771 offset + 32));
5772
5773 /* Make "m" use the LD1 offset range for SVE data modes, so
5774 that pre-RTL optimizers like ivopts will work to that
5775 instead of the wider LDR/STR range. */
5776 if (vec_flags == VEC_SVE_DATA)
5777 return (type == ADDR_QUERY_M
5778 ? offset_4bit_signed_scaled_p (mode, offset)
5779 : offset_9bit_signed_scaled_p (mode, offset));
5780
5781 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5782 {
5783 poly_int64 end_offset = (offset
5784 + GET_MODE_SIZE (mode)
5785 - BYTES_PER_SVE_VECTOR);
5786 return (type == ADDR_QUERY_M
5787 ? offset_4bit_signed_scaled_p (mode, offset)
5788 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5789 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5790 end_offset)));
5791 }
5792
5793 if (vec_flags == VEC_SVE_PRED)
5794 return offset_9bit_signed_scaled_p (mode, offset);
5795
5796 if (load_store_pair_p)
5797 return ((known_eq (GET_MODE_SIZE (mode), 4)
5798 || known_eq (GET_MODE_SIZE (mode), 8))
5799 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5800 else
5801 return (offset_9bit_signed_unscaled_p (mode, offset)
5802 || offset_12bit_unsigned_scaled_p (mode, offset));
5803 }
5804
5805 if (allow_reg_index_p)
5806 {
5807 /* Look for base + (scaled/extended) index register. */
5808 if (aarch64_base_register_rtx_p (op0, strict_p)
5809 && aarch64_classify_index (info, op1, mode, strict_p))
5810 {
5811 info->base = op0;
5812 return true;
5813 }
5814 if (aarch64_base_register_rtx_p (op1, strict_p)
5815 && aarch64_classify_index (info, op0, mode, strict_p))
5816 {
5817 info->base = op1;
5818 return true;
5819 }
5820 }
5821
5822 return false;
5823
5824 case POST_INC:
5825 case POST_DEC:
5826 case PRE_INC:
5827 case PRE_DEC:
5828 info->type = ADDRESS_REG_WB;
5829 info->base = XEXP (x, 0);
5830 info->offset = NULL_RTX;
5831 return aarch64_base_register_rtx_p (info->base, strict_p);
5832
5833 case POST_MODIFY:
5834 case PRE_MODIFY:
5835 info->type = ADDRESS_REG_WB;
5836 info->base = XEXP (x, 0);
5837 if (GET_CODE (XEXP (x, 1)) == PLUS
5838 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5839 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5840 && aarch64_base_register_rtx_p (info->base, strict_p))
5841 {
5842 info->offset = XEXP (XEXP (x, 1), 1);
5843 info->const_offset = offset;
5844
5845 /* TImode and TFmode values are allowed in both pairs of X
5846 registers and individual Q registers. The available
5847 address modes are:
5848 X,X: 7-bit signed scaled offset
5849 Q: 9-bit signed offset
5850 We conservatively require an offset representable in either mode.
5851 */
5852 if (mode == TImode || mode == TFmode)
5853 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5854 && offset_9bit_signed_unscaled_p (mode, offset));
5855
5856 if (load_store_pair_p)
5857 return ((known_eq (GET_MODE_SIZE (mode), 4)
5858 || known_eq (GET_MODE_SIZE (mode), 8))
5859 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5860 else
5861 return offset_9bit_signed_unscaled_p (mode, offset);
5862 }
5863 return false;
5864
5865 case CONST:
5866 case SYMBOL_REF:
5867 case LABEL_REF:
5868 /* load literal: pc-relative constant pool entry. Only supported
5869 for SI mode or larger. */
5870 info->type = ADDRESS_SYMBOLIC;
5871
5872 if (!load_store_pair_p
5873 && GET_MODE_SIZE (mode).is_constant (&const_size)
5874 && const_size >= 4)
5875 {
5876 rtx sym, addend;
5877
5878 split_const (x, &sym, &addend);
5879 return ((GET_CODE (sym) == LABEL_REF
5880 || (GET_CODE (sym) == SYMBOL_REF
5881 && CONSTANT_POOL_ADDRESS_P (sym)
5882 && aarch64_pcrelative_literal_loads)));
5883 }
5884 return false;
5885
5886 case LO_SUM:
5887 info->type = ADDRESS_LO_SUM;
5888 info->base = XEXP (x, 0);
5889 info->offset = XEXP (x, 1);
5890 if (allow_reg_index_p
5891 && aarch64_base_register_rtx_p (info->base, strict_p))
5892 {
5893 rtx sym, offs;
5894 split_const (info->offset, &sym, &offs);
5895 if (GET_CODE (sym) == SYMBOL_REF
5896 && (aarch64_classify_symbol (sym, INTVAL (offs))
5897 == SYMBOL_SMALL_ABSOLUTE))
5898 {
5899 /* The symbol and offset must be aligned to the access size. */
5900 unsigned int align;
5901
5902 if (CONSTANT_POOL_ADDRESS_P (sym))
5903 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5904 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5905 {
5906 tree exp = SYMBOL_REF_DECL (sym);
5907 align = TYPE_ALIGN (TREE_TYPE (exp));
5908 align = aarch64_constant_alignment (exp, align);
5909 }
5910 else if (SYMBOL_REF_DECL (sym))
5911 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5912 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5913 && SYMBOL_REF_BLOCK (sym) != NULL)
5914 align = SYMBOL_REF_BLOCK (sym)->alignment;
5915 else
5916 align = BITS_PER_UNIT;
5917
5918 poly_int64 ref_size = GET_MODE_SIZE (mode);
5919 if (known_eq (ref_size, 0))
5920 ref_size = GET_MODE_SIZE (DImode);
5921
5922 return (multiple_p (INTVAL (offs), ref_size)
5923 && multiple_p (align / BITS_PER_UNIT, ref_size));
5924 }
5925 }
5926 return false;
5927
5928 default:
5929 return false;
5930 }
5931 }
5932
5933 /* Return true if the address X is valid for a PRFM instruction.
5934 STRICT_P is true if we should do strict checking with
5935 aarch64_classify_address. */
5936
5937 bool
5938 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5939 {
5940 struct aarch64_address_info addr;
5941
5942 /* PRFM accepts the same addresses as DImode... */
5943 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5944 if (!res)
5945 return false;
5946
5947 /* ... except writeback forms. */
5948 return addr.type != ADDRESS_REG_WB;
5949 }
5950
5951 bool
5952 aarch64_symbolic_address_p (rtx x)
5953 {
5954 rtx offset;
5955
5956 split_const (x, &x, &offset);
5957 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5958 }
5959
5960 /* Classify the base of symbolic expression X. */
5961
5962 enum aarch64_symbol_type
5963 aarch64_classify_symbolic_expression (rtx x)
5964 {
5965 rtx offset;
5966
5967 split_const (x, &x, &offset);
5968 return aarch64_classify_symbol (x, INTVAL (offset));
5969 }
5970
5971
5972 /* Return TRUE if X is a legitimate address for accessing memory in
5973 mode MODE. */
5974 static bool
5975 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5976 {
5977 struct aarch64_address_info addr;
5978
5979 return aarch64_classify_address (&addr, x, mode, strict_p);
5980 }
5981
5982 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5983 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5984 bool
5985 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5986 aarch64_addr_query_type type)
5987 {
5988 struct aarch64_address_info addr;
5989
5990 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5991 }
5992
5993 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5994
5995 static bool
5996 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
5997 poly_int64 orig_offset,
5998 machine_mode mode)
5999 {
6000 HOST_WIDE_INT size;
6001 if (GET_MODE_SIZE (mode).is_constant (&size))
6002 {
6003 HOST_WIDE_INT const_offset, second_offset;
6004
6005 /* A general SVE offset is A * VQ + B. Remove the A component from
6006 coefficient 0 in order to get the constant B. */
6007 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6008
6009 /* Split an out-of-range address displacement into a base and
6010 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6011 range otherwise to increase opportunities for sharing the base
6012 address of different sizes. Unaligned accesses use the signed
6013 9-bit range, TImode/TFmode use the intersection of signed
6014 scaled 7-bit and signed 9-bit offset. */
6015 if (mode == TImode || mode == TFmode)
6016 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6017 else if ((const_offset & (size - 1)) != 0)
6018 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6019 else
6020 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6021
6022 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6023 return false;
6024
6025 /* Split the offset into second_offset and the rest. */
6026 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6027 *offset2 = gen_int_mode (second_offset, Pmode);
6028 return true;
6029 }
6030 else
6031 {
6032 /* Get the mode we should use as the basis of the range. For structure
6033 modes this is the mode of one vector. */
6034 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6035 machine_mode step_mode
6036 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6037
6038 /* Get the "mul vl" multiplier we'd like to use. */
6039 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6040 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6041 if (vec_flags & VEC_SVE_DATA)
6042 /* LDR supports a 9-bit range, but the move patterns for
6043 structure modes require all vectors to be in range of the
6044 same base. The simplest way of accomodating that while still
6045 promoting reuse of anchor points between different modes is
6046 to use an 8-bit range unconditionally. */
6047 vnum = ((vnum + 128) & 255) - 128;
6048 else
6049 /* Predicates are only handled singly, so we might as well use
6050 the full range. */
6051 vnum = ((vnum + 256) & 511) - 256;
6052 if (vnum == 0)
6053 return false;
6054
6055 /* Convert the "mul vl" multiplier into a byte offset. */
6056 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6057 if (known_eq (second_offset, orig_offset))
6058 return false;
6059
6060 /* Split the offset into second_offset and the rest. */
6061 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6062 *offset2 = gen_int_mode (second_offset, Pmode);
6063 return true;
6064 }
6065 }
6066
6067 /* Return the binary representation of floating point constant VALUE in INTVAL.
6068 If the value cannot be converted, return false without setting INTVAL.
6069 The conversion is done in the given MODE. */
6070 bool
6071 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6072 {
6073
6074 /* We make a general exception for 0. */
6075 if (aarch64_float_const_zero_rtx_p (value))
6076 {
6077 *intval = 0;
6078 return true;
6079 }
6080
6081 scalar_float_mode mode;
6082 if (GET_CODE (value) != CONST_DOUBLE
6083 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6084 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6085 /* Only support up to DF mode. */
6086 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6087 return false;
6088
6089 unsigned HOST_WIDE_INT ival = 0;
6090
6091 long res[2];
6092 real_to_target (res,
6093 CONST_DOUBLE_REAL_VALUE (value),
6094 REAL_MODE_FORMAT (mode));
6095
6096 if (mode == DFmode)
6097 {
6098 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6099 ival = zext_hwi (res[order], 32);
6100 ival |= (zext_hwi (res[1 - order], 32) << 32);
6101 }
6102 else
6103 ival = zext_hwi (res[0], 32);
6104
6105 *intval = ival;
6106 return true;
6107 }
6108
6109 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6110 single MOV(+MOVK) followed by an FMOV. */
6111 bool
6112 aarch64_float_const_rtx_p (rtx x)
6113 {
6114 machine_mode mode = GET_MODE (x);
6115 if (mode == VOIDmode)
6116 return false;
6117
6118 /* Determine whether it's cheaper to write float constants as
6119 mov/movk pairs over ldr/adrp pairs. */
6120 unsigned HOST_WIDE_INT ival;
6121
6122 if (GET_CODE (x) == CONST_DOUBLE
6123 && SCALAR_FLOAT_MODE_P (mode)
6124 && aarch64_reinterpret_float_as_int (x, &ival))
6125 {
6126 scalar_int_mode imode = (mode == HFmode
6127 ? SImode
6128 : int_mode_for_mode (mode).require ());
6129 int num_instr = aarch64_internal_mov_immediate
6130 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6131 return num_instr < 3;
6132 }
6133
6134 return false;
6135 }
6136
6137 /* Return TRUE if rtx X is immediate constant 0.0 */
6138 bool
6139 aarch64_float_const_zero_rtx_p (rtx x)
6140 {
6141 if (GET_MODE (x) == VOIDmode)
6142 return false;
6143
6144 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6145 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6146 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6147 }
6148
6149 /* Return TRUE if rtx X is immediate constant that fits in a single
6150 MOVI immediate operation. */
6151 bool
6152 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6153 {
6154 if (!TARGET_SIMD)
6155 return false;
6156
6157 machine_mode vmode;
6158 scalar_int_mode imode;
6159 unsigned HOST_WIDE_INT ival;
6160
6161 if (GET_CODE (x) == CONST_DOUBLE
6162 && SCALAR_FLOAT_MODE_P (mode))
6163 {
6164 if (!aarch64_reinterpret_float_as_int (x, &ival))
6165 return false;
6166
6167 /* We make a general exception for 0. */
6168 if (aarch64_float_const_zero_rtx_p (x))
6169 return true;
6170
6171 imode = int_mode_for_mode (mode).require ();
6172 }
6173 else if (GET_CODE (x) == CONST_INT
6174 && is_a <scalar_int_mode> (mode, &imode))
6175 ival = INTVAL (x);
6176 else
6177 return false;
6178
6179 /* use a 64 bit mode for everything except for DI/DF mode, where we use
6180 a 128 bit vector mode. */
6181 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6182
6183 vmode = aarch64_simd_container_mode (imode, width);
6184 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6185
6186 return aarch64_simd_valid_immediate (v_op, NULL);
6187 }
6188
6189
6190 /* Return the fixed registers used for condition codes. */
6191
6192 static bool
6193 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6194 {
6195 *p1 = CC_REGNUM;
6196 *p2 = INVALID_REGNUM;
6197 return true;
6198 }
6199
6200 /* This function is used by the call expanders of the machine description.
6201 RESULT is the register in which the result is returned. It's NULL for
6202 "call" and "sibcall".
6203 MEM is the location of the function call.
6204 SIBCALL indicates whether this function call is normal call or sibling call.
6205 It will generate different pattern accordingly. */
6206
6207 void
6208 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6209 {
6210 rtx call, callee, tmp;
6211 rtvec vec;
6212 machine_mode mode;
6213
6214 gcc_assert (MEM_P (mem));
6215 callee = XEXP (mem, 0);
6216 mode = GET_MODE (callee);
6217 gcc_assert (mode == Pmode);
6218
6219 /* Decide if we should generate indirect calls by loading the
6220 address of the callee into a register before performing
6221 the branch-and-link. */
6222 if (SYMBOL_REF_P (callee)
6223 ? (aarch64_is_long_call_p (callee)
6224 || aarch64_is_noplt_call_p (callee))
6225 : !REG_P (callee))
6226 XEXP (mem, 0) = force_reg (mode, callee);
6227
6228 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6229
6230 if (result != NULL_RTX)
6231 call = gen_rtx_SET (result, call);
6232
6233 if (sibcall)
6234 tmp = ret_rtx;
6235 else
6236 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6237
6238 vec = gen_rtvec (2, call, tmp);
6239 call = gen_rtx_PARALLEL (VOIDmode, vec);
6240
6241 aarch64_emit_call_insn (call);
6242 }
6243
6244 /* Emit call insn with PAT and do aarch64-specific handling. */
6245
6246 void
6247 aarch64_emit_call_insn (rtx pat)
6248 {
6249 rtx insn = emit_call_insn (pat);
6250
6251 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6252 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6253 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6254 }
6255
6256 machine_mode
6257 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6258 {
6259 /* All floating point compares return CCFP if it is an equality
6260 comparison, and CCFPE otherwise. */
6261 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6262 {
6263 switch (code)
6264 {
6265 case EQ:
6266 case NE:
6267 case UNORDERED:
6268 case ORDERED:
6269 case UNLT:
6270 case UNLE:
6271 case UNGT:
6272 case UNGE:
6273 case UNEQ:
6274 return CCFPmode;
6275
6276 case LT:
6277 case LE:
6278 case GT:
6279 case GE:
6280 case LTGT:
6281 return CCFPEmode;
6282
6283 default:
6284 gcc_unreachable ();
6285 }
6286 }
6287
6288 /* Equality comparisons of short modes against zero can be performed
6289 using the TST instruction with the appropriate bitmask. */
6290 if (y == const0_rtx && REG_P (x)
6291 && (code == EQ || code == NE)
6292 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6293 return CC_NZmode;
6294
6295 /* Similarly, comparisons of zero_extends from shorter modes can
6296 be performed using an ANDS with an immediate mask. */
6297 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6298 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6299 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6300 && (code == EQ || code == NE))
6301 return CC_NZmode;
6302
6303 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6304 && y == const0_rtx
6305 && (code == EQ || code == NE || code == LT || code == GE)
6306 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6307 || GET_CODE (x) == NEG
6308 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6309 && CONST_INT_P (XEXP (x, 2)))))
6310 return CC_NZmode;
6311
6312 /* A compare with a shifted operand. Because of canonicalization,
6313 the comparison will have to be swapped when we emit the assembly
6314 code. */
6315 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6316 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6317 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6318 || GET_CODE (x) == LSHIFTRT
6319 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6320 return CC_SWPmode;
6321
6322 /* Similarly for a negated operand, but we can only do this for
6323 equalities. */
6324 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6325 && (REG_P (y) || GET_CODE (y) == SUBREG)
6326 && (code == EQ || code == NE)
6327 && GET_CODE (x) == NEG)
6328 return CC_Zmode;
6329
6330 /* A test for unsigned overflow. */
6331 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6332 && code == NE
6333 && GET_CODE (x) == PLUS
6334 && GET_CODE (y) == ZERO_EXTEND)
6335 return CC_Cmode;
6336
6337 /* For everything else, return CCmode. */
6338 return CCmode;
6339 }
6340
6341 static int
6342 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6343
6344 int
6345 aarch64_get_condition_code (rtx x)
6346 {
6347 machine_mode mode = GET_MODE (XEXP (x, 0));
6348 enum rtx_code comp_code = GET_CODE (x);
6349
6350 if (GET_MODE_CLASS (mode) != MODE_CC)
6351 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6352 return aarch64_get_condition_code_1 (mode, comp_code);
6353 }
6354
6355 static int
6356 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6357 {
6358 switch (mode)
6359 {
6360 case E_CCFPmode:
6361 case E_CCFPEmode:
6362 switch (comp_code)
6363 {
6364 case GE: return AARCH64_GE;
6365 case GT: return AARCH64_GT;
6366 case LE: return AARCH64_LS;
6367 case LT: return AARCH64_MI;
6368 case NE: return AARCH64_NE;
6369 case EQ: return AARCH64_EQ;
6370 case ORDERED: return AARCH64_VC;
6371 case UNORDERED: return AARCH64_VS;
6372 case UNLT: return AARCH64_LT;
6373 case UNLE: return AARCH64_LE;
6374 case UNGT: return AARCH64_HI;
6375 case UNGE: return AARCH64_PL;
6376 default: return -1;
6377 }
6378 break;
6379
6380 case E_CCmode:
6381 switch (comp_code)
6382 {
6383 case NE: return AARCH64_NE;
6384 case EQ: return AARCH64_EQ;
6385 case GE: return AARCH64_GE;
6386 case GT: return AARCH64_GT;
6387 case LE: return AARCH64_LE;
6388 case LT: return AARCH64_LT;
6389 case GEU: return AARCH64_CS;
6390 case GTU: return AARCH64_HI;
6391 case LEU: return AARCH64_LS;
6392 case LTU: return AARCH64_CC;
6393 default: return -1;
6394 }
6395 break;
6396
6397 case E_CC_SWPmode:
6398 switch (comp_code)
6399 {
6400 case NE: return AARCH64_NE;
6401 case EQ: return AARCH64_EQ;
6402 case GE: return AARCH64_LE;
6403 case GT: return AARCH64_LT;
6404 case LE: return AARCH64_GE;
6405 case LT: return AARCH64_GT;
6406 case GEU: return AARCH64_LS;
6407 case GTU: return AARCH64_CC;
6408 case LEU: return AARCH64_CS;
6409 case LTU: return AARCH64_HI;
6410 default: return -1;
6411 }
6412 break;
6413
6414 case E_CC_NZmode:
6415 switch (comp_code)
6416 {
6417 case NE: return AARCH64_NE;
6418 case EQ: return AARCH64_EQ;
6419 case GE: return AARCH64_PL;
6420 case LT: return AARCH64_MI;
6421 default: return -1;
6422 }
6423 break;
6424
6425 case E_CC_Zmode:
6426 switch (comp_code)
6427 {
6428 case NE: return AARCH64_NE;
6429 case EQ: return AARCH64_EQ;
6430 default: return -1;
6431 }
6432 break;
6433
6434 case E_CC_Cmode:
6435 switch (comp_code)
6436 {
6437 case NE: return AARCH64_CS;
6438 case EQ: return AARCH64_CC;
6439 default: return -1;
6440 }
6441 break;
6442
6443 default:
6444 return -1;
6445 }
6446
6447 return -1;
6448 }
6449
6450 bool
6451 aarch64_const_vec_all_same_in_range_p (rtx x,
6452 HOST_WIDE_INT minval,
6453 HOST_WIDE_INT maxval)
6454 {
6455 rtx elt;
6456 return (const_vec_duplicate_p (x, &elt)
6457 && CONST_INT_P (elt)
6458 && IN_RANGE (INTVAL (elt), minval, maxval));
6459 }
6460
6461 bool
6462 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6463 {
6464 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6465 }
6466
6467 /* Return true if VEC is a constant in which every element is in the range
6468 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6469
6470 static bool
6471 aarch64_const_vec_all_in_range_p (rtx vec,
6472 HOST_WIDE_INT minval,
6473 HOST_WIDE_INT maxval)
6474 {
6475 if (GET_CODE (vec) != CONST_VECTOR
6476 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6477 return false;
6478
6479 int nunits;
6480 if (!CONST_VECTOR_STEPPED_P (vec))
6481 nunits = const_vector_encoded_nelts (vec);
6482 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6483 return false;
6484
6485 for (int i = 0; i < nunits; i++)
6486 {
6487 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6488 if (!CONST_INT_P (vec_elem)
6489 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6490 return false;
6491 }
6492 return true;
6493 }
6494
6495 /* N Z C V. */
6496 #define AARCH64_CC_V 1
6497 #define AARCH64_CC_C (1 << 1)
6498 #define AARCH64_CC_Z (1 << 2)
6499 #define AARCH64_CC_N (1 << 3)
6500
6501 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6502 static const int aarch64_nzcv_codes[] =
6503 {
6504 0, /* EQ, Z == 1. */
6505 AARCH64_CC_Z, /* NE, Z == 0. */
6506 0, /* CS, C == 1. */
6507 AARCH64_CC_C, /* CC, C == 0. */
6508 0, /* MI, N == 1. */
6509 AARCH64_CC_N, /* PL, N == 0. */
6510 0, /* VS, V == 1. */
6511 AARCH64_CC_V, /* VC, V == 0. */
6512 0, /* HI, C ==1 && Z == 0. */
6513 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6514 AARCH64_CC_V, /* GE, N == V. */
6515 0, /* LT, N != V. */
6516 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6517 0, /* LE, !(Z == 0 && N == V). */
6518 0, /* AL, Any. */
6519 0 /* NV, Any. */
6520 };
6521
6522 /* Print floating-point vector immediate operand X to F, negating it
6523 first if NEGATE is true. Return true on success, false if it isn't
6524 a constant we can handle. */
6525
6526 static bool
6527 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6528 {
6529 rtx elt;
6530
6531 if (!const_vec_duplicate_p (x, &elt))
6532 return false;
6533
6534 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6535 if (negate)
6536 r = real_value_negate (&r);
6537
6538 /* We only handle the SVE single-bit immediates here. */
6539 if (real_equal (&r, &dconst0))
6540 asm_fprintf (f, "0.0");
6541 else if (real_equal (&r, &dconst1))
6542 asm_fprintf (f, "1.0");
6543 else if (real_equal (&r, &dconsthalf))
6544 asm_fprintf (f, "0.5");
6545 else
6546 return false;
6547
6548 return true;
6549 }
6550
6551 /* Return the equivalent letter for size. */
6552 static char
6553 sizetochar (int size)
6554 {
6555 switch (size)
6556 {
6557 case 64: return 'd';
6558 case 32: return 's';
6559 case 16: return 'h';
6560 case 8 : return 'b';
6561 default: gcc_unreachable ();
6562 }
6563 }
6564
6565 /* Print operand X to file F in a target specific manner according to CODE.
6566 The acceptable formatting commands given by CODE are:
6567 'c': An integer or symbol address without a preceding #
6568 sign.
6569 'C': Take the duplicated element in a vector constant
6570 and print it in hex.
6571 'D': Take the duplicated element in a vector constant
6572 and print it as an unsigned integer, in decimal.
6573 'e': Print the sign/zero-extend size as a character 8->b,
6574 16->h, 32->w.
6575 'p': Prints N such that 2^N == X (X must be power of 2 and
6576 const int).
6577 'P': Print the number of non-zero bits in X (a const_int).
6578 'H': Print the higher numbered register of a pair (TImode)
6579 of regs.
6580 'm': Print a condition (eq, ne, etc).
6581 'M': Same as 'm', but invert condition.
6582 'N': Take the duplicated element in a vector constant
6583 and print the negative of it in decimal.
6584 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6585 'S/T/U/V': Print a FP/SIMD register name for a register list.
6586 The register printed is the FP/SIMD register name
6587 of X + 0/1/2/3 for S/T/U/V.
6588 'R': Print a scalar FP/SIMD register name + 1.
6589 'X': Print bottom 16 bits of integer constant in hex.
6590 'w/x': Print a general register name or the zero register
6591 (32-bit or 64-bit).
6592 '0': Print a normal operand, if it's a general register,
6593 then we assume DImode.
6594 'k': Print NZCV for conditional compare instructions.
6595 'A': Output address constant representing the first
6596 argument of X, specifying a relocation offset
6597 if appropriate.
6598 'L': Output constant address specified by X
6599 with a relocation offset if appropriate.
6600 'G': Prints address of X, specifying a PC relative
6601 relocation mode if appropriate.
6602 'y': Output address of LDP or STP - this is used for
6603 some LDP/STPs which don't use a PARALLEL in their
6604 pattern (so the mode needs to be adjusted).
6605 'z': Output address of a typical LDP or STP. */
6606
6607 static void
6608 aarch64_print_operand (FILE *f, rtx x, int code)
6609 {
6610 rtx elt;
6611 switch (code)
6612 {
6613 case 'c':
6614 switch (GET_CODE (x))
6615 {
6616 case CONST_INT:
6617 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6618 break;
6619
6620 case SYMBOL_REF:
6621 output_addr_const (f, x);
6622 break;
6623
6624 case CONST:
6625 if (GET_CODE (XEXP (x, 0)) == PLUS
6626 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6627 {
6628 output_addr_const (f, x);
6629 break;
6630 }
6631 /* Fall through. */
6632
6633 default:
6634 output_operand_lossage ("unsupported operand for code '%c'", code);
6635 }
6636 break;
6637
6638 case 'e':
6639 {
6640 int n;
6641
6642 if (!CONST_INT_P (x)
6643 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6644 {
6645 output_operand_lossage ("invalid operand for '%%%c'", code);
6646 return;
6647 }
6648
6649 switch (n)
6650 {
6651 case 3:
6652 fputc ('b', f);
6653 break;
6654 case 4:
6655 fputc ('h', f);
6656 break;
6657 case 5:
6658 fputc ('w', f);
6659 break;
6660 default:
6661 output_operand_lossage ("invalid operand for '%%%c'", code);
6662 return;
6663 }
6664 }
6665 break;
6666
6667 case 'p':
6668 {
6669 int n;
6670
6671 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6672 {
6673 output_operand_lossage ("invalid operand for '%%%c'", code);
6674 return;
6675 }
6676
6677 asm_fprintf (f, "%d", n);
6678 }
6679 break;
6680
6681 case 'P':
6682 if (!CONST_INT_P (x))
6683 {
6684 output_operand_lossage ("invalid operand for '%%%c'", code);
6685 return;
6686 }
6687
6688 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6689 break;
6690
6691 case 'H':
6692 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6693 {
6694 output_operand_lossage ("invalid operand for '%%%c'", code);
6695 return;
6696 }
6697
6698 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6699 break;
6700
6701 case 'M':
6702 case 'm':
6703 {
6704 int cond_code;
6705 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6706 if (x == const_true_rtx)
6707 {
6708 if (code == 'M')
6709 fputs ("nv", f);
6710 return;
6711 }
6712
6713 if (!COMPARISON_P (x))
6714 {
6715 output_operand_lossage ("invalid operand for '%%%c'", code);
6716 return;
6717 }
6718
6719 cond_code = aarch64_get_condition_code (x);
6720 gcc_assert (cond_code >= 0);
6721 if (code == 'M')
6722 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6723 fputs (aarch64_condition_codes[cond_code], f);
6724 }
6725 break;
6726
6727 case 'N':
6728 if (!const_vec_duplicate_p (x, &elt))
6729 {
6730 output_operand_lossage ("invalid vector constant");
6731 return;
6732 }
6733
6734 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6735 asm_fprintf (f, "%wd", -INTVAL (elt));
6736 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6737 && aarch64_print_vector_float_operand (f, x, true))
6738 ;
6739 else
6740 {
6741 output_operand_lossage ("invalid vector constant");
6742 return;
6743 }
6744 break;
6745
6746 case 'b':
6747 case 'h':
6748 case 's':
6749 case 'd':
6750 case 'q':
6751 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6752 {
6753 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6754 return;
6755 }
6756 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6757 break;
6758
6759 case 'S':
6760 case 'T':
6761 case 'U':
6762 case 'V':
6763 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6764 {
6765 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6766 return;
6767 }
6768 asm_fprintf (f, "%c%d",
6769 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6770 REGNO (x) - V0_REGNUM + (code - 'S'));
6771 break;
6772
6773 case 'R':
6774 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6775 {
6776 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6777 return;
6778 }
6779 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6780 break;
6781
6782 case 'X':
6783 if (!CONST_INT_P (x))
6784 {
6785 output_operand_lossage ("invalid operand for '%%%c'", code);
6786 return;
6787 }
6788 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6789 break;
6790
6791 case 'C':
6792 {
6793 /* Print a replicated constant in hex. */
6794 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6795 {
6796 output_operand_lossage ("invalid operand for '%%%c'", code);
6797 return;
6798 }
6799 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6800 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6801 }
6802 break;
6803
6804 case 'D':
6805 {
6806 /* Print a replicated constant in decimal, treating it as
6807 unsigned. */
6808 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6809 {
6810 output_operand_lossage ("invalid operand for '%%%c'", code);
6811 return;
6812 }
6813 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6814 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6815 }
6816 break;
6817
6818 case 'w':
6819 case 'x':
6820 if (x == const0_rtx
6821 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6822 {
6823 asm_fprintf (f, "%czr", code);
6824 break;
6825 }
6826
6827 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6828 {
6829 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6830 break;
6831 }
6832
6833 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6834 {
6835 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6836 break;
6837 }
6838
6839 /* Fall through */
6840
6841 case 0:
6842 if (x == NULL)
6843 {
6844 output_operand_lossage ("missing operand");
6845 return;
6846 }
6847
6848 switch (GET_CODE (x))
6849 {
6850 case REG:
6851 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6852 {
6853 if (REG_NREGS (x) == 1)
6854 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6855 else
6856 {
6857 char suffix
6858 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6859 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6860 REGNO (x) - V0_REGNUM, suffix,
6861 END_REGNO (x) - V0_REGNUM - 1, suffix);
6862 }
6863 }
6864 else
6865 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6866 break;
6867
6868 case MEM:
6869 output_address (GET_MODE (x), XEXP (x, 0));
6870 break;
6871
6872 case LABEL_REF:
6873 case SYMBOL_REF:
6874 output_addr_const (asm_out_file, x);
6875 break;
6876
6877 case CONST_INT:
6878 asm_fprintf (f, "%wd", INTVAL (x));
6879 break;
6880
6881 case CONST:
6882 if (!VECTOR_MODE_P (GET_MODE (x)))
6883 {
6884 output_addr_const (asm_out_file, x);
6885 break;
6886 }
6887 /* fall through */
6888
6889 case CONST_VECTOR:
6890 if (!const_vec_duplicate_p (x, &elt))
6891 {
6892 output_operand_lossage ("invalid vector constant");
6893 return;
6894 }
6895
6896 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6897 asm_fprintf (f, "%wd", INTVAL (elt));
6898 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6899 && aarch64_print_vector_float_operand (f, x, false))
6900 ;
6901 else
6902 {
6903 output_operand_lossage ("invalid vector constant");
6904 return;
6905 }
6906 break;
6907
6908 case CONST_DOUBLE:
6909 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6910 be getting CONST_DOUBLEs holding integers. */
6911 gcc_assert (GET_MODE (x) != VOIDmode);
6912 if (aarch64_float_const_zero_rtx_p (x))
6913 {
6914 fputc ('0', f);
6915 break;
6916 }
6917 else if (aarch64_float_const_representable_p (x))
6918 {
6919 #define buf_size 20
6920 char float_buf[buf_size] = {'\0'};
6921 real_to_decimal_for_mode (float_buf,
6922 CONST_DOUBLE_REAL_VALUE (x),
6923 buf_size, buf_size,
6924 1, GET_MODE (x));
6925 asm_fprintf (asm_out_file, "%s", float_buf);
6926 break;
6927 #undef buf_size
6928 }
6929 output_operand_lossage ("invalid constant");
6930 return;
6931 default:
6932 output_operand_lossage ("invalid operand");
6933 return;
6934 }
6935 break;
6936
6937 case 'A':
6938 if (GET_CODE (x) == HIGH)
6939 x = XEXP (x, 0);
6940
6941 switch (aarch64_classify_symbolic_expression (x))
6942 {
6943 case SYMBOL_SMALL_GOT_4G:
6944 asm_fprintf (asm_out_file, ":got:");
6945 break;
6946
6947 case SYMBOL_SMALL_TLSGD:
6948 asm_fprintf (asm_out_file, ":tlsgd:");
6949 break;
6950
6951 case SYMBOL_SMALL_TLSDESC:
6952 asm_fprintf (asm_out_file, ":tlsdesc:");
6953 break;
6954
6955 case SYMBOL_SMALL_TLSIE:
6956 asm_fprintf (asm_out_file, ":gottprel:");
6957 break;
6958
6959 case SYMBOL_TLSLE24:
6960 asm_fprintf (asm_out_file, ":tprel:");
6961 break;
6962
6963 case SYMBOL_TINY_GOT:
6964 gcc_unreachable ();
6965 break;
6966
6967 default:
6968 break;
6969 }
6970 output_addr_const (asm_out_file, x);
6971 break;
6972
6973 case 'L':
6974 switch (aarch64_classify_symbolic_expression (x))
6975 {
6976 case SYMBOL_SMALL_GOT_4G:
6977 asm_fprintf (asm_out_file, ":lo12:");
6978 break;
6979
6980 case SYMBOL_SMALL_TLSGD:
6981 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6982 break;
6983
6984 case SYMBOL_SMALL_TLSDESC:
6985 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6986 break;
6987
6988 case SYMBOL_SMALL_TLSIE:
6989 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6990 break;
6991
6992 case SYMBOL_TLSLE12:
6993 asm_fprintf (asm_out_file, ":tprel_lo12:");
6994 break;
6995
6996 case SYMBOL_TLSLE24:
6997 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6998 break;
6999
7000 case SYMBOL_TINY_GOT:
7001 asm_fprintf (asm_out_file, ":got:");
7002 break;
7003
7004 case SYMBOL_TINY_TLSIE:
7005 asm_fprintf (asm_out_file, ":gottprel:");
7006 break;
7007
7008 default:
7009 break;
7010 }
7011 output_addr_const (asm_out_file, x);
7012 break;
7013
7014 case 'G':
7015 switch (aarch64_classify_symbolic_expression (x))
7016 {
7017 case SYMBOL_TLSLE24:
7018 asm_fprintf (asm_out_file, ":tprel_hi12:");
7019 break;
7020 default:
7021 break;
7022 }
7023 output_addr_const (asm_out_file, x);
7024 break;
7025
7026 case 'k':
7027 {
7028 HOST_WIDE_INT cond_code;
7029
7030 if (!CONST_INT_P (x))
7031 {
7032 output_operand_lossage ("invalid operand for '%%%c'", code);
7033 return;
7034 }
7035
7036 cond_code = INTVAL (x);
7037 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7038 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7039 }
7040 break;
7041
7042 case 'y':
7043 case 'z':
7044 {
7045 machine_mode mode = GET_MODE (x);
7046
7047 if (GET_CODE (x) != MEM
7048 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7049 {
7050 output_operand_lossage ("invalid operand for '%%%c'", code);
7051 return;
7052 }
7053
7054 if (code == 'y')
7055 /* LDP/STP which uses a single double-width memory operand.
7056 Adjust the mode to appear like a typical LDP/STP.
7057 Currently this is supported for 16-byte accesses only. */
7058 mode = DFmode;
7059
7060 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7061 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7062 }
7063 break;
7064
7065 default:
7066 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7067 return;
7068 }
7069 }
7070
7071 /* Print address 'x' of a memory access with mode 'mode'.
7072 'op' is the context required by aarch64_classify_address. It can either be
7073 MEM for a normal memory access or PARALLEL for LDP/STP. */
7074 static bool
7075 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7076 aarch64_addr_query_type type)
7077 {
7078 struct aarch64_address_info addr;
7079 unsigned int size;
7080
7081 /* Check all addresses are Pmode - including ILP32. */
7082 if (GET_MODE (x) != Pmode)
7083 output_operand_lossage ("invalid address mode");
7084
7085 if (aarch64_classify_address (&addr, x, mode, true, type))
7086 switch (addr.type)
7087 {
7088 case ADDRESS_REG_IMM:
7089 if (known_eq (addr.const_offset, 0))
7090 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7091 else if (aarch64_sve_data_mode_p (mode))
7092 {
7093 HOST_WIDE_INT vnum
7094 = exact_div (addr.const_offset,
7095 BYTES_PER_SVE_VECTOR).to_constant ();
7096 asm_fprintf (f, "[%s, #%wd, mul vl]",
7097 reg_names[REGNO (addr.base)], vnum);
7098 }
7099 else if (aarch64_sve_pred_mode_p (mode))
7100 {
7101 HOST_WIDE_INT vnum
7102 = exact_div (addr.const_offset,
7103 BYTES_PER_SVE_PRED).to_constant ();
7104 asm_fprintf (f, "[%s, #%wd, mul vl]",
7105 reg_names[REGNO (addr.base)], vnum);
7106 }
7107 else
7108 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7109 INTVAL (addr.offset));
7110 return true;
7111
7112 case ADDRESS_REG_REG:
7113 if (addr.shift == 0)
7114 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7115 reg_names [REGNO (addr.offset)]);
7116 else
7117 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7118 reg_names [REGNO (addr.offset)], addr.shift);
7119 return true;
7120
7121 case ADDRESS_REG_UXTW:
7122 if (addr.shift == 0)
7123 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7124 REGNO (addr.offset) - R0_REGNUM);
7125 else
7126 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7127 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7128 return true;
7129
7130 case ADDRESS_REG_SXTW:
7131 if (addr.shift == 0)
7132 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7133 REGNO (addr.offset) - R0_REGNUM);
7134 else
7135 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7136 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7137 return true;
7138
7139 case ADDRESS_REG_WB:
7140 /* Writeback is only supported for fixed-width modes. */
7141 size = GET_MODE_SIZE (mode).to_constant ();
7142 switch (GET_CODE (x))
7143 {
7144 case PRE_INC:
7145 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7146 return true;
7147 case POST_INC:
7148 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7149 return true;
7150 case PRE_DEC:
7151 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7152 return true;
7153 case POST_DEC:
7154 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7155 return true;
7156 case PRE_MODIFY:
7157 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7158 INTVAL (addr.offset));
7159 return true;
7160 case POST_MODIFY:
7161 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7162 INTVAL (addr.offset));
7163 return true;
7164 default:
7165 break;
7166 }
7167 break;
7168
7169 case ADDRESS_LO_SUM:
7170 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7171 output_addr_const (f, addr.offset);
7172 asm_fprintf (f, "]");
7173 return true;
7174
7175 case ADDRESS_SYMBOLIC:
7176 output_addr_const (f, x);
7177 return true;
7178 }
7179
7180 return false;
7181 }
7182
7183 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7184 static bool
7185 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7186 {
7187 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7188 }
7189
7190 /* Print address 'x' of a memory access with mode 'mode'. */
7191 static void
7192 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7193 {
7194 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7195 output_addr_const (f, x);
7196 }
7197
7198 bool
7199 aarch64_label_mentioned_p (rtx x)
7200 {
7201 const char *fmt;
7202 int i;
7203
7204 if (GET_CODE (x) == LABEL_REF)
7205 return true;
7206
7207 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7208 referencing instruction, but they are constant offsets, not
7209 symbols. */
7210 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7211 return false;
7212
7213 fmt = GET_RTX_FORMAT (GET_CODE (x));
7214 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7215 {
7216 if (fmt[i] == 'E')
7217 {
7218 int j;
7219
7220 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7221 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7222 return 1;
7223 }
7224 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7225 return 1;
7226 }
7227
7228 return 0;
7229 }
7230
7231 /* Implement REGNO_REG_CLASS. */
7232
7233 enum reg_class
7234 aarch64_regno_regclass (unsigned regno)
7235 {
7236 if (GP_REGNUM_P (regno))
7237 return GENERAL_REGS;
7238
7239 if (regno == SP_REGNUM)
7240 return STACK_REG;
7241
7242 if (regno == FRAME_POINTER_REGNUM
7243 || regno == ARG_POINTER_REGNUM)
7244 return POINTER_REGS;
7245
7246 if (FP_REGNUM_P (regno))
7247 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7248
7249 if (PR_REGNUM_P (regno))
7250 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7251
7252 return NO_REGS;
7253 }
7254
7255 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7256 If OFFSET is out of range, return an offset of an anchor point
7257 that is in range. Return 0 otherwise. */
7258
7259 static HOST_WIDE_INT
7260 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7261 machine_mode mode)
7262 {
7263 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7264 if (size > 16)
7265 return (offset + 0x400) & ~0x7f0;
7266
7267 /* For offsets that aren't a multiple of the access size, the limit is
7268 -256...255. */
7269 if (offset & (size - 1))
7270 {
7271 /* BLKmode typically uses LDP of X-registers. */
7272 if (mode == BLKmode)
7273 return (offset + 512) & ~0x3ff;
7274 return (offset + 0x100) & ~0x1ff;
7275 }
7276
7277 /* Small negative offsets are supported. */
7278 if (IN_RANGE (offset, -256, 0))
7279 return 0;
7280
7281 if (mode == TImode || mode == TFmode)
7282 return (offset + 0x100) & ~0x1ff;
7283
7284 /* Use 12-bit offset by access size. */
7285 return offset & (~0xfff * size);
7286 }
7287
7288 static rtx
7289 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7290 {
7291 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7292 where mask is selected by alignment and size of the offset.
7293 We try to pick as large a range for the offset as possible to
7294 maximize the chance of a CSE. However, for aligned addresses
7295 we limit the range to 4k so that structures with different sized
7296 elements are likely to use the same base. We need to be careful
7297 not to split a CONST for some forms of address expression, otherwise
7298 it will generate sub-optimal code. */
7299
7300 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7301 {
7302 rtx base = XEXP (x, 0);
7303 rtx offset_rtx = XEXP (x, 1);
7304 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7305
7306 if (GET_CODE (base) == PLUS)
7307 {
7308 rtx op0 = XEXP (base, 0);
7309 rtx op1 = XEXP (base, 1);
7310
7311 /* Force any scaling into a temp for CSE. */
7312 op0 = force_reg (Pmode, op0);
7313 op1 = force_reg (Pmode, op1);
7314
7315 /* Let the pointer register be in op0. */
7316 if (REG_POINTER (op1))
7317 std::swap (op0, op1);
7318
7319 /* If the pointer is virtual or frame related, then we know that
7320 virtual register instantiation or register elimination is going
7321 to apply a second constant. We want the two constants folded
7322 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7323 if (virt_or_elim_regno_p (REGNO (op0)))
7324 {
7325 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7326 NULL_RTX, true, OPTAB_DIRECT);
7327 return gen_rtx_PLUS (Pmode, base, op1);
7328 }
7329
7330 /* Otherwise, in order to encourage CSE (and thence loop strength
7331 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7332 base = expand_binop (Pmode, add_optab, op0, op1,
7333 NULL_RTX, true, OPTAB_DIRECT);
7334 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7335 }
7336
7337 HOST_WIDE_INT size;
7338 if (GET_MODE_SIZE (mode).is_constant (&size))
7339 {
7340 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7341 mode);
7342 if (base_offset != 0)
7343 {
7344 base = plus_constant (Pmode, base, base_offset);
7345 base = force_operand (base, NULL_RTX);
7346 return plus_constant (Pmode, base, offset - base_offset);
7347 }
7348 }
7349 }
7350
7351 return x;
7352 }
7353
7354 /* Return the reload icode required for a constant pool in mode. */
7355 static enum insn_code
7356 aarch64_constant_pool_reload_icode (machine_mode mode)
7357 {
7358 switch (mode)
7359 {
7360 case E_SFmode:
7361 return CODE_FOR_aarch64_reload_movcpsfdi;
7362
7363 case E_DFmode:
7364 return CODE_FOR_aarch64_reload_movcpdfdi;
7365
7366 case E_TFmode:
7367 return CODE_FOR_aarch64_reload_movcptfdi;
7368
7369 case E_V8QImode:
7370 return CODE_FOR_aarch64_reload_movcpv8qidi;
7371
7372 case E_V16QImode:
7373 return CODE_FOR_aarch64_reload_movcpv16qidi;
7374
7375 case E_V4HImode:
7376 return CODE_FOR_aarch64_reload_movcpv4hidi;
7377
7378 case E_V8HImode:
7379 return CODE_FOR_aarch64_reload_movcpv8hidi;
7380
7381 case E_V2SImode:
7382 return CODE_FOR_aarch64_reload_movcpv2sidi;
7383
7384 case E_V4SImode:
7385 return CODE_FOR_aarch64_reload_movcpv4sidi;
7386
7387 case E_V2DImode:
7388 return CODE_FOR_aarch64_reload_movcpv2didi;
7389
7390 case E_V2DFmode:
7391 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7392
7393 default:
7394 gcc_unreachable ();
7395 }
7396
7397 gcc_unreachable ();
7398 }
7399 static reg_class_t
7400 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7401 reg_class_t rclass,
7402 machine_mode mode,
7403 secondary_reload_info *sri)
7404 {
7405 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7406 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7407 comment at the head of aarch64-sve.md for more details about the
7408 big-endian handling. */
7409 if (BYTES_BIG_ENDIAN
7410 && reg_class_subset_p (rclass, FP_REGS)
7411 && !((REG_P (x) && HARD_REGISTER_P (x))
7412 || aarch64_simd_valid_immediate (x, NULL))
7413 && aarch64_sve_data_mode_p (mode))
7414 {
7415 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7416 return NO_REGS;
7417 }
7418
7419 /* If we have to disable direct literal pool loads and stores because the
7420 function is too big, then we need a scratch register. */
7421 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7422 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7423 || targetm.vector_mode_supported_p (GET_MODE (x)))
7424 && !aarch64_pcrelative_literal_loads)
7425 {
7426 sri->icode = aarch64_constant_pool_reload_icode (mode);
7427 return NO_REGS;
7428 }
7429
7430 /* Without the TARGET_SIMD instructions we cannot move a Q register
7431 to a Q register directly. We need a scratch. */
7432 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7433 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7434 && reg_class_subset_p (rclass, FP_REGS))
7435 {
7436 if (mode == TFmode)
7437 sri->icode = CODE_FOR_aarch64_reload_movtf;
7438 else if (mode == TImode)
7439 sri->icode = CODE_FOR_aarch64_reload_movti;
7440 return NO_REGS;
7441 }
7442
7443 /* A TFmode or TImode memory access should be handled via an FP_REGS
7444 because AArch64 has richer addressing modes for LDR/STR instructions
7445 than LDP/STP instructions. */
7446 if (TARGET_FLOAT && rclass == GENERAL_REGS
7447 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7448 return FP_REGS;
7449
7450 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7451 return GENERAL_REGS;
7452
7453 return NO_REGS;
7454 }
7455
7456 static bool
7457 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7458 {
7459 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7460
7461 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7462 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7463 if (frame_pointer_needed)
7464 return to == HARD_FRAME_POINTER_REGNUM;
7465 return true;
7466 }
7467
7468 poly_int64
7469 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7470 {
7471 aarch64_layout_frame ();
7472
7473 if (to == HARD_FRAME_POINTER_REGNUM)
7474 {
7475 if (from == ARG_POINTER_REGNUM)
7476 return cfun->machine->frame.hard_fp_offset;
7477
7478 if (from == FRAME_POINTER_REGNUM)
7479 return cfun->machine->frame.hard_fp_offset
7480 - cfun->machine->frame.locals_offset;
7481 }
7482
7483 if (to == STACK_POINTER_REGNUM)
7484 {
7485 if (from == FRAME_POINTER_REGNUM)
7486 return cfun->machine->frame.frame_size
7487 - cfun->machine->frame.locals_offset;
7488 }
7489
7490 return cfun->machine->frame.frame_size;
7491 }
7492
7493 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7494 previous frame. */
7495
7496 rtx
7497 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7498 {
7499 if (count != 0)
7500 return const0_rtx;
7501 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7502 }
7503
7504
7505 static void
7506 aarch64_asm_trampoline_template (FILE *f)
7507 {
7508 if (TARGET_ILP32)
7509 {
7510 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7511 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7512 }
7513 else
7514 {
7515 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7516 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7517 }
7518 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7519 assemble_aligned_integer (4, const0_rtx);
7520 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7521 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7522 }
7523
7524 static void
7525 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7526 {
7527 rtx fnaddr, mem, a_tramp;
7528 const int tramp_code_sz = 16;
7529
7530 /* Don't need to copy the trailing D-words, we fill those in below. */
7531 emit_block_move (m_tramp, assemble_trampoline_template (),
7532 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7533 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7534 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7535 if (GET_MODE (fnaddr) != ptr_mode)
7536 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7537 emit_move_insn (mem, fnaddr);
7538
7539 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7540 emit_move_insn (mem, chain_value);
7541
7542 /* XXX We should really define a "clear_cache" pattern and use
7543 gen_clear_cache(). */
7544 a_tramp = XEXP (m_tramp, 0);
7545 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7546 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7547 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7548 ptr_mode);
7549 }
7550
7551 static unsigned char
7552 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7553 {
7554 /* ??? Logically we should only need to provide a value when
7555 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7556 can hold MODE, but at the moment we need to handle all modes.
7557 Just ignore any runtime parts for registers that can't store them. */
7558 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7559 unsigned int nregs;
7560 switch (regclass)
7561 {
7562 case TAILCALL_ADDR_REGS:
7563 case POINTER_REGS:
7564 case GENERAL_REGS:
7565 case ALL_REGS:
7566 case POINTER_AND_FP_REGS:
7567 case FP_REGS:
7568 case FP_LO_REGS:
7569 if (aarch64_sve_data_mode_p (mode)
7570 && constant_multiple_p (GET_MODE_SIZE (mode),
7571 BYTES_PER_SVE_VECTOR, &nregs))
7572 return nregs;
7573 return (aarch64_vector_data_mode_p (mode)
7574 ? CEIL (lowest_size, UNITS_PER_VREG)
7575 : CEIL (lowest_size, UNITS_PER_WORD));
7576 case STACK_REG:
7577 case PR_REGS:
7578 case PR_LO_REGS:
7579 case PR_HI_REGS:
7580 return 1;
7581
7582 case NO_REGS:
7583 return 0;
7584
7585 default:
7586 break;
7587 }
7588 gcc_unreachable ();
7589 }
7590
7591 static reg_class_t
7592 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7593 {
7594 if (regclass == POINTER_REGS)
7595 return GENERAL_REGS;
7596
7597 if (regclass == STACK_REG)
7598 {
7599 if (REG_P(x)
7600 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7601 return regclass;
7602
7603 return NO_REGS;
7604 }
7605
7606 /* Register eliminiation can result in a request for
7607 SP+constant->FP_REGS. We cannot support such operations which
7608 use SP as source and an FP_REG as destination, so reject out
7609 right now. */
7610 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7611 {
7612 rtx lhs = XEXP (x, 0);
7613
7614 /* Look through a possible SUBREG introduced by ILP32. */
7615 if (GET_CODE (lhs) == SUBREG)
7616 lhs = SUBREG_REG (lhs);
7617
7618 gcc_assert (REG_P (lhs));
7619 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7620 POINTER_REGS));
7621 return NO_REGS;
7622 }
7623
7624 return regclass;
7625 }
7626
7627 void
7628 aarch64_asm_output_labelref (FILE* f, const char *name)
7629 {
7630 asm_fprintf (f, "%U%s", name);
7631 }
7632
7633 static void
7634 aarch64_elf_asm_constructor (rtx symbol, int priority)
7635 {
7636 if (priority == DEFAULT_INIT_PRIORITY)
7637 default_ctor_section_asm_out_constructor (symbol, priority);
7638 else
7639 {
7640 section *s;
7641 /* While priority is known to be in range [0, 65535], so 18 bytes
7642 would be enough, the compiler might not know that. To avoid
7643 -Wformat-truncation false positive, use a larger size. */
7644 char buf[23];
7645 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7646 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7647 switch_to_section (s);
7648 assemble_align (POINTER_SIZE);
7649 assemble_aligned_integer (POINTER_BYTES, symbol);
7650 }
7651 }
7652
7653 static void
7654 aarch64_elf_asm_destructor (rtx symbol, int priority)
7655 {
7656 if (priority == DEFAULT_INIT_PRIORITY)
7657 default_dtor_section_asm_out_destructor (symbol, priority);
7658 else
7659 {
7660 section *s;
7661 /* While priority is known to be in range [0, 65535], so 18 bytes
7662 would be enough, the compiler might not know that. To avoid
7663 -Wformat-truncation false positive, use a larger size. */
7664 char buf[23];
7665 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7666 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7667 switch_to_section (s);
7668 assemble_align (POINTER_SIZE);
7669 assemble_aligned_integer (POINTER_BYTES, symbol);
7670 }
7671 }
7672
7673 const char*
7674 aarch64_output_casesi (rtx *operands)
7675 {
7676 char buf[100];
7677 char label[100];
7678 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7679 int index;
7680 static const char *const patterns[4][2] =
7681 {
7682 {
7683 "ldrb\t%w3, [%0,%w1,uxtw]",
7684 "add\t%3, %4, %w3, sxtb #2"
7685 },
7686 {
7687 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7688 "add\t%3, %4, %w3, sxth #2"
7689 },
7690 {
7691 "ldr\t%w3, [%0,%w1,uxtw #2]",
7692 "add\t%3, %4, %w3, sxtw #2"
7693 },
7694 /* We assume that DImode is only generated when not optimizing and
7695 that we don't really need 64-bit address offsets. That would
7696 imply an object file with 8GB of code in a single function! */
7697 {
7698 "ldr\t%w3, [%0,%w1,uxtw #2]",
7699 "add\t%3, %4, %w3, sxtw #2"
7700 }
7701 };
7702
7703 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7704
7705 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7706 index = exact_log2 (GET_MODE_SIZE (mode));
7707
7708 gcc_assert (index >= 0 && index <= 3);
7709
7710 /* Need to implement table size reduction, by chaning the code below. */
7711 output_asm_insn (patterns[index][0], operands);
7712 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7713 snprintf (buf, sizeof (buf),
7714 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7715 output_asm_insn (buf, operands);
7716 output_asm_insn (patterns[index][1], operands);
7717 output_asm_insn ("br\t%3", operands);
7718 assemble_label (asm_out_file, label);
7719 return "";
7720 }
7721
7722
7723 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7724 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7725 operator. */
7726
7727 int
7728 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7729 {
7730 if (shift >= 0 && shift <= 3)
7731 {
7732 int size;
7733 for (size = 8; size <= 32; size *= 2)
7734 {
7735 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7736 if (mask == bits << shift)
7737 return size;
7738 }
7739 }
7740 return 0;
7741 }
7742
7743 /* Constant pools are per function only when PC relative
7744 literal loads are true or we are in the large memory
7745 model. */
7746
7747 static inline bool
7748 aarch64_can_use_per_function_literal_pools_p (void)
7749 {
7750 return (aarch64_pcrelative_literal_loads
7751 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7752 }
7753
7754 static bool
7755 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7756 {
7757 /* We can't use blocks for constants when we're using a per-function
7758 constant pool. */
7759 return !aarch64_can_use_per_function_literal_pools_p ();
7760 }
7761
7762 /* Select appropriate section for constants depending
7763 on where we place literal pools. */
7764
7765 static section *
7766 aarch64_select_rtx_section (machine_mode mode,
7767 rtx x,
7768 unsigned HOST_WIDE_INT align)
7769 {
7770 if (aarch64_can_use_per_function_literal_pools_p ())
7771 return function_section (current_function_decl);
7772
7773 return default_elf_select_rtx_section (mode, x, align);
7774 }
7775
7776 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7777 void
7778 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7779 HOST_WIDE_INT offset)
7780 {
7781 /* When using per-function literal pools, we must ensure that any code
7782 section is aligned to the minimal instruction length, lest we get
7783 errors from the assembler re "unaligned instructions". */
7784 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7785 ASM_OUTPUT_ALIGN (f, 2);
7786 }
7787
7788 /* Costs. */
7789
7790 /* Helper function for rtx cost calculation. Strip a shift expression
7791 from X. Returns the inner operand if successful, or the original
7792 expression on failure. */
7793 static rtx
7794 aarch64_strip_shift (rtx x)
7795 {
7796 rtx op = x;
7797
7798 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7799 we can convert both to ROR during final output. */
7800 if ((GET_CODE (op) == ASHIFT
7801 || GET_CODE (op) == ASHIFTRT
7802 || GET_CODE (op) == LSHIFTRT
7803 || GET_CODE (op) == ROTATERT
7804 || GET_CODE (op) == ROTATE)
7805 && CONST_INT_P (XEXP (op, 1)))
7806 return XEXP (op, 0);
7807
7808 if (GET_CODE (op) == MULT
7809 && CONST_INT_P (XEXP (op, 1))
7810 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7811 return XEXP (op, 0);
7812
7813 return x;
7814 }
7815
7816 /* Helper function for rtx cost calculation. Strip an extend
7817 expression from X. Returns the inner operand if successful, or the
7818 original expression on failure. We deal with a number of possible
7819 canonicalization variations here. If STRIP_SHIFT is true, then
7820 we can strip off a shift also. */
7821 static rtx
7822 aarch64_strip_extend (rtx x, bool strip_shift)
7823 {
7824 scalar_int_mode mode;
7825 rtx op = x;
7826
7827 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7828 return op;
7829
7830 /* Zero and sign extraction of a widened value. */
7831 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7832 && XEXP (op, 2) == const0_rtx
7833 && GET_CODE (XEXP (op, 0)) == MULT
7834 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7835 XEXP (op, 1)))
7836 return XEXP (XEXP (op, 0), 0);
7837
7838 /* It can also be represented (for zero-extend) as an AND with an
7839 immediate. */
7840 if (GET_CODE (op) == AND
7841 && GET_CODE (XEXP (op, 0)) == MULT
7842 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7843 && CONST_INT_P (XEXP (op, 1))
7844 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7845 INTVAL (XEXP (op, 1))) != 0)
7846 return XEXP (XEXP (op, 0), 0);
7847
7848 /* Now handle extended register, as this may also have an optional
7849 left shift by 1..4. */
7850 if (strip_shift
7851 && GET_CODE (op) == ASHIFT
7852 && CONST_INT_P (XEXP (op, 1))
7853 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7854 op = XEXP (op, 0);
7855
7856 if (GET_CODE (op) == ZERO_EXTEND
7857 || GET_CODE (op) == SIGN_EXTEND)
7858 op = XEXP (op, 0);
7859
7860 if (op != x)
7861 return op;
7862
7863 return x;
7864 }
7865
7866 /* Return true iff CODE is a shift supported in combination
7867 with arithmetic instructions. */
7868
7869 static bool
7870 aarch64_shift_p (enum rtx_code code)
7871 {
7872 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7873 }
7874
7875
7876 /* Return true iff X is a cheap shift without a sign extend. */
7877
7878 static bool
7879 aarch64_cheap_mult_shift_p (rtx x)
7880 {
7881 rtx op0, op1;
7882
7883 op0 = XEXP (x, 0);
7884 op1 = XEXP (x, 1);
7885
7886 if (!(aarch64_tune_params.extra_tuning_flags
7887 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7888 return false;
7889
7890 if (GET_CODE (op0) == SIGN_EXTEND)
7891 return false;
7892
7893 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7894 && UINTVAL (op1) <= 4)
7895 return true;
7896
7897 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7898 return false;
7899
7900 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7901
7902 if (l2 > 0 && l2 <= 4)
7903 return true;
7904
7905 return false;
7906 }
7907
7908 /* Helper function for rtx cost calculation. Calculate the cost of
7909 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7910 Return the calculated cost of the expression, recursing manually in to
7911 operands where needed. */
7912
7913 static int
7914 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7915 {
7916 rtx op0, op1;
7917 const struct cpu_cost_table *extra_cost
7918 = aarch64_tune_params.insn_extra_cost;
7919 int cost = 0;
7920 bool compound_p = (outer == PLUS || outer == MINUS);
7921 machine_mode mode = GET_MODE (x);
7922
7923 gcc_checking_assert (code == MULT);
7924
7925 op0 = XEXP (x, 0);
7926 op1 = XEXP (x, 1);
7927
7928 if (VECTOR_MODE_P (mode))
7929 mode = GET_MODE_INNER (mode);
7930
7931 /* Integer multiply/fma. */
7932 if (GET_MODE_CLASS (mode) == MODE_INT)
7933 {
7934 /* The multiply will be canonicalized as a shift, cost it as such. */
7935 if (aarch64_shift_p (GET_CODE (x))
7936 || (CONST_INT_P (op1)
7937 && exact_log2 (INTVAL (op1)) > 0))
7938 {
7939 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7940 || GET_CODE (op0) == SIGN_EXTEND;
7941 if (speed)
7942 {
7943 if (compound_p)
7944 {
7945 /* If the shift is considered cheap,
7946 then don't add any cost. */
7947 if (aarch64_cheap_mult_shift_p (x))
7948 ;
7949 else if (REG_P (op1))
7950 /* ARITH + shift-by-register. */
7951 cost += extra_cost->alu.arith_shift_reg;
7952 else if (is_extend)
7953 /* ARITH + extended register. We don't have a cost field
7954 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7955 cost += extra_cost->alu.extend_arith;
7956 else
7957 /* ARITH + shift-by-immediate. */
7958 cost += extra_cost->alu.arith_shift;
7959 }
7960 else
7961 /* LSL (immediate). */
7962 cost += extra_cost->alu.shift;
7963
7964 }
7965 /* Strip extends as we will have costed them in the case above. */
7966 if (is_extend)
7967 op0 = aarch64_strip_extend (op0, true);
7968
7969 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7970
7971 return cost;
7972 }
7973
7974 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7975 compound and let the below cases handle it. After all, MNEG is a
7976 special-case alias of MSUB. */
7977 if (GET_CODE (op0) == NEG)
7978 {
7979 op0 = XEXP (op0, 0);
7980 compound_p = true;
7981 }
7982
7983 /* Integer multiplies or FMAs have zero/sign extending variants. */
7984 if ((GET_CODE (op0) == ZERO_EXTEND
7985 && GET_CODE (op1) == ZERO_EXTEND)
7986 || (GET_CODE (op0) == SIGN_EXTEND
7987 && GET_CODE (op1) == SIGN_EXTEND))
7988 {
7989 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7990 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7991
7992 if (speed)
7993 {
7994 if (compound_p)
7995 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7996 cost += extra_cost->mult[0].extend_add;
7997 else
7998 /* MUL/SMULL/UMULL. */
7999 cost += extra_cost->mult[0].extend;
8000 }
8001
8002 return cost;
8003 }
8004
8005 /* This is either an integer multiply or a MADD. In both cases
8006 we want to recurse and cost the operands. */
8007 cost += rtx_cost (op0, mode, MULT, 0, speed);
8008 cost += rtx_cost (op1, mode, MULT, 1, speed);
8009
8010 if (speed)
8011 {
8012 if (compound_p)
8013 /* MADD/MSUB. */
8014 cost += extra_cost->mult[mode == DImode].add;
8015 else
8016 /* MUL. */
8017 cost += extra_cost->mult[mode == DImode].simple;
8018 }
8019
8020 return cost;
8021 }
8022 else
8023 {
8024 if (speed)
8025 {
8026 /* Floating-point FMA/FMUL can also support negations of the
8027 operands, unless the rounding mode is upward or downward in
8028 which case FNMUL is different than FMUL with operand negation. */
8029 bool neg0 = GET_CODE (op0) == NEG;
8030 bool neg1 = GET_CODE (op1) == NEG;
8031 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8032 {
8033 if (neg0)
8034 op0 = XEXP (op0, 0);
8035 if (neg1)
8036 op1 = XEXP (op1, 0);
8037 }
8038
8039 if (compound_p)
8040 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8041 cost += extra_cost->fp[mode == DFmode].fma;
8042 else
8043 /* FMUL/FNMUL. */
8044 cost += extra_cost->fp[mode == DFmode].mult;
8045 }
8046
8047 cost += rtx_cost (op0, mode, MULT, 0, speed);
8048 cost += rtx_cost (op1, mode, MULT, 1, speed);
8049 return cost;
8050 }
8051 }
8052
8053 static int
8054 aarch64_address_cost (rtx x,
8055 machine_mode mode,
8056 addr_space_t as ATTRIBUTE_UNUSED,
8057 bool speed)
8058 {
8059 enum rtx_code c = GET_CODE (x);
8060 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8061 struct aarch64_address_info info;
8062 int cost = 0;
8063 info.shift = 0;
8064
8065 if (!aarch64_classify_address (&info, x, mode, false))
8066 {
8067 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8068 {
8069 /* This is a CONST or SYMBOL ref which will be split
8070 in a different way depending on the code model in use.
8071 Cost it through the generic infrastructure. */
8072 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8073 /* Divide through by the cost of one instruction to
8074 bring it to the same units as the address costs. */
8075 cost_symbol_ref /= COSTS_N_INSNS (1);
8076 /* The cost is then the cost of preparing the address,
8077 followed by an immediate (possibly 0) offset. */
8078 return cost_symbol_ref + addr_cost->imm_offset;
8079 }
8080 else
8081 {
8082 /* This is most likely a jump table from a case
8083 statement. */
8084 return addr_cost->register_offset;
8085 }
8086 }
8087
8088 switch (info.type)
8089 {
8090 case ADDRESS_LO_SUM:
8091 case ADDRESS_SYMBOLIC:
8092 case ADDRESS_REG_IMM:
8093 cost += addr_cost->imm_offset;
8094 break;
8095
8096 case ADDRESS_REG_WB:
8097 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8098 cost += addr_cost->pre_modify;
8099 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8100 cost += addr_cost->post_modify;
8101 else
8102 gcc_unreachable ();
8103
8104 break;
8105
8106 case ADDRESS_REG_REG:
8107 cost += addr_cost->register_offset;
8108 break;
8109
8110 case ADDRESS_REG_SXTW:
8111 cost += addr_cost->register_sextend;
8112 break;
8113
8114 case ADDRESS_REG_UXTW:
8115 cost += addr_cost->register_zextend;
8116 break;
8117
8118 default:
8119 gcc_unreachable ();
8120 }
8121
8122
8123 if (info.shift > 0)
8124 {
8125 /* For the sake of calculating the cost of the shifted register
8126 component, we can treat same sized modes in the same way. */
8127 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8128 cost += addr_cost->addr_scale_costs.hi;
8129 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8130 cost += addr_cost->addr_scale_costs.si;
8131 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8132 cost += addr_cost->addr_scale_costs.di;
8133 else
8134 /* We can't tell, or this is a 128-bit vector. */
8135 cost += addr_cost->addr_scale_costs.ti;
8136 }
8137
8138 return cost;
8139 }
8140
8141 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8142 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8143 to be taken. */
8144
8145 int
8146 aarch64_branch_cost (bool speed_p, bool predictable_p)
8147 {
8148 /* When optimizing for speed, use the cost of unpredictable branches. */
8149 const struct cpu_branch_cost *branch_costs =
8150 aarch64_tune_params.branch_costs;
8151
8152 if (!speed_p || predictable_p)
8153 return branch_costs->predictable;
8154 else
8155 return branch_costs->unpredictable;
8156 }
8157
8158 /* Return true if the RTX X in mode MODE is a zero or sign extract
8159 usable in an ADD or SUB (extended register) instruction. */
8160 static bool
8161 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8162 {
8163 /* Catch add with a sign extract.
8164 This is add_<optab><mode>_multp2. */
8165 if (GET_CODE (x) == SIGN_EXTRACT
8166 || GET_CODE (x) == ZERO_EXTRACT)
8167 {
8168 rtx op0 = XEXP (x, 0);
8169 rtx op1 = XEXP (x, 1);
8170 rtx op2 = XEXP (x, 2);
8171
8172 if (GET_CODE (op0) == MULT
8173 && CONST_INT_P (op1)
8174 && op2 == const0_rtx
8175 && CONST_INT_P (XEXP (op0, 1))
8176 && aarch64_is_extend_from_extract (mode,
8177 XEXP (op0, 1),
8178 op1))
8179 {
8180 return true;
8181 }
8182 }
8183 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8184 No shift. */
8185 else if (GET_CODE (x) == SIGN_EXTEND
8186 || GET_CODE (x) == ZERO_EXTEND)
8187 return REG_P (XEXP (x, 0));
8188
8189 return false;
8190 }
8191
8192 static bool
8193 aarch64_frint_unspec_p (unsigned int u)
8194 {
8195 switch (u)
8196 {
8197 case UNSPEC_FRINTZ:
8198 case UNSPEC_FRINTP:
8199 case UNSPEC_FRINTM:
8200 case UNSPEC_FRINTA:
8201 case UNSPEC_FRINTN:
8202 case UNSPEC_FRINTX:
8203 case UNSPEC_FRINTI:
8204 return true;
8205
8206 default:
8207 return false;
8208 }
8209 }
8210
8211 /* Return true iff X is an rtx that will match an extr instruction
8212 i.e. as described in the *extr<mode>5_insn family of patterns.
8213 OP0 and OP1 will be set to the operands of the shifts involved
8214 on success and will be NULL_RTX otherwise. */
8215
8216 static bool
8217 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8218 {
8219 rtx op0, op1;
8220 scalar_int_mode mode;
8221 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8222 return false;
8223
8224 *res_op0 = NULL_RTX;
8225 *res_op1 = NULL_RTX;
8226
8227 if (GET_CODE (x) != IOR)
8228 return false;
8229
8230 op0 = XEXP (x, 0);
8231 op1 = XEXP (x, 1);
8232
8233 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8234 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8235 {
8236 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8237 if (GET_CODE (op1) == ASHIFT)
8238 std::swap (op0, op1);
8239
8240 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8241 return false;
8242
8243 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8244 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8245
8246 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8247 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8248 {
8249 *res_op0 = XEXP (op0, 0);
8250 *res_op1 = XEXP (op1, 0);
8251 return true;
8252 }
8253 }
8254
8255 return false;
8256 }
8257
8258 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8259 storing it in *COST. Result is true if the total cost of the operation
8260 has now been calculated. */
8261 static bool
8262 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8263 {
8264 rtx inner;
8265 rtx comparator;
8266 enum rtx_code cmpcode;
8267
8268 if (COMPARISON_P (op0))
8269 {
8270 inner = XEXP (op0, 0);
8271 comparator = XEXP (op0, 1);
8272 cmpcode = GET_CODE (op0);
8273 }
8274 else
8275 {
8276 inner = op0;
8277 comparator = const0_rtx;
8278 cmpcode = NE;
8279 }
8280
8281 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8282 {
8283 /* Conditional branch. */
8284 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8285 return true;
8286 else
8287 {
8288 if (cmpcode == NE || cmpcode == EQ)
8289 {
8290 if (comparator == const0_rtx)
8291 {
8292 /* TBZ/TBNZ/CBZ/CBNZ. */
8293 if (GET_CODE (inner) == ZERO_EXTRACT)
8294 /* TBZ/TBNZ. */
8295 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8296 ZERO_EXTRACT, 0, speed);
8297 else
8298 /* CBZ/CBNZ. */
8299 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8300
8301 return true;
8302 }
8303 }
8304 else if (cmpcode == LT || cmpcode == GE)
8305 {
8306 /* TBZ/TBNZ. */
8307 if (comparator == const0_rtx)
8308 return true;
8309 }
8310 }
8311 }
8312 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8313 {
8314 /* CCMP. */
8315 if (GET_CODE (op1) == COMPARE)
8316 {
8317 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8318 if (XEXP (op1, 1) == const0_rtx)
8319 *cost += 1;
8320 if (speed)
8321 {
8322 machine_mode mode = GET_MODE (XEXP (op1, 0));
8323 const struct cpu_cost_table *extra_cost
8324 = aarch64_tune_params.insn_extra_cost;
8325
8326 if (GET_MODE_CLASS (mode) == MODE_INT)
8327 *cost += extra_cost->alu.arith;
8328 else
8329 *cost += extra_cost->fp[mode == DFmode].compare;
8330 }
8331 return true;
8332 }
8333
8334 /* It's a conditional operation based on the status flags,
8335 so it must be some flavor of CSEL. */
8336
8337 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8338 if (GET_CODE (op1) == NEG
8339 || GET_CODE (op1) == NOT
8340 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8341 op1 = XEXP (op1, 0);
8342 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8343 {
8344 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8345 op1 = XEXP (op1, 0);
8346 op2 = XEXP (op2, 0);
8347 }
8348
8349 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8350 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8351 return true;
8352 }
8353
8354 /* We don't know what this is, cost all operands. */
8355 return false;
8356 }
8357
8358 /* Check whether X is a bitfield operation of the form shift + extend that
8359 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8360 operand to which the bitfield operation is applied. Otherwise return
8361 NULL_RTX. */
8362
8363 static rtx
8364 aarch64_extend_bitfield_pattern_p (rtx x)
8365 {
8366 rtx_code outer_code = GET_CODE (x);
8367 machine_mode outer_mode = GET_MODE (x);
8368
8369 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8370 && outer_mode != SImode && outer_mode != DImode)
8371 return NULL_RTX;
8372
8373 rtx inner = XEXP (x, 0);
8374 rtx_code inner_code = GET_CODE (inner);
8375 machine_mode inner_mode = GET_MODE (inner);
8376 rtx op = NULL_RTX;
8377
8378 switch (inner_code)
8379 {
8380 case ASHIFT:
8381 if (CONST_INT_P (XEXP (inner, 1))
8382 && (inner_mode == QImode || inner_mode == HImode))
8383 op = XEXP (inner, 0);
8384 break;
8385 case LSHIFTRT:
8386 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8387 && (inner_mode == QImode || inner_mode == HImode))
8388 op = XEXP (inner, 0);
8389 break;
8390 case ASHIFTRT:
8391 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8392 && (inner_mode == QImode || inner_mode == HImode))
8393 op = XEXP (inner, 0);
8394 break;
8395 default:
8396 break;
8397 }
8398
8399 return op;
8400 }
8401
8402 /* Return true if the mask and a shift amount from an RTX of the form
8403 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8404 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8405
8406 bool
8407 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8408 rtx shft_amnt)
8409 {
8410 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8411 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8412 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8413 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8414 }
8415
8416 /* Calculate the cost of calculating X, storing it in *COST. Result
8417 is true if the total cost of the operation has now been calculated. */
8418 static bool
8419 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8420 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8421 {
8422 rtx op0, op1, op2;
8423 const struct cpu_cost_table *extra_cost
8424 = aarch64_tune_params.insn_extra_cost;
8425 int code = GET_CODE (x);
8426 scalar_int_mode int_mode;
8427
8428 /* By default, assume that everything has equivalent cost to the
8429 cheapest instruction. Any additional costs are applied as a delta
8430 above this default. */
8431 *cost = COSTS_N_INSNS (1);
8432
8433 switch (code)
8434 {
8435 case SET:
8436 /* The cost depends entirely on the operands to SET. */
8437 *cost = 0;
8438 op0 = SET_DEST (x);
8439 op1 = SET_SRC (x);
8440
8441 switch (GET_CODE (op0))
8442 {
8443 case MEM:
8444 if (speed)
8445 {
8446 rtx address = XEXP (op0, 0);
8447 if (VECTOR_MODE_P (mode))
8448 *cost += extra_cost->ldst.storev;
8449 else if (GET_MODE_CLASS (mode) == MODE_INT)
8450 *cost += extra_cost->ldst.store;
8451 else if (mode == SFmode)
8452 *cost += extra_cost->ldst.storef;
8453 else if (mode == DFmode)
8454 *cost += extra_cost->ldst.stored;
8455
8456 *cost +=
8457 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8458 0, speed));
8459 }
8460
8461 *cost += rtx_cost (op1, mode, SET, 1, speed);
8462 return true;
8463
8464 case SUBREG:
8465 if (! REG_P (SUBREG_REG (op0)))
8466 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8467
8468 /* Fall through. */
8469 case REG:
8470 /* The cost is one per vector-register copied. */
8471 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8472 {
8473 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8474 *cost = COSTS_N_INSNS (nregs);
8475 }
8476 /* const0_rtx is in general free, but we will use an
8477 instruction to set a register to 0. */
8478 else if (REG_P (op1) || op1 == const0_rtx)
8479 {
8480 /* The cost is 1 per register copied. */
8481 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8482 *cost = COSTS_N_INSNS (nregs);
8483 }
8484 else
8485 /* Cost is just the cost of the RHS of the set. */
8486 *cost += rtx_cost (op1, mode, SET, 1, speed);
8487 return true;
8488
8489 case ZERO_EXTRACT:
8490 case SIGN_EXTRACT:
8491 /* Bit-field insertion. Strip any redundant widening of
8492 the RHS to meet the width of the target. */
8493 if (GET_CODE (op1) == SUBREG)
8494 op1 = SUBREG_REG (op1);
8495 if ((GET_CODE (op1) == ZERO_EXTEND
8496 || GET_CODE (op1) == SIGN_EXTEND)
8497 && CONST_INT_P (XEXP (op0, 1))
8498 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8499 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8500 op1 = XEXP (op1, 0);
8501
8502 if (CONST_INT_P (op1))
8503 {
8504 /* MOV immediate is assumed to always be cheap. */
8505 *cost = COSTS_N_INSNS (1);
8506 }
8507 else
8508 {
8509 /* BFM. */
8510 if (speed)
8511 *cost += extra_cost->alu.bfi;
8512 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8513 }
8514
8515 return true;
8516
8517 default:
8518 /* We can't make sense of this, assume default cost. */
8519 *cost = COSTS_N_INSNS (1);
8520 return false;
8521 }
8522 return false;
8523
8524 case CONST_INT:
8525 /* If an instruction can incorporate a constant within the
8526 instruction, the instruction's expression avoids calling
8527 rtx_cost() on the constant. If rtx_cost() is called on a
8528 constant, then it is usually because the constant must be
8529 moved into a register by one or more instructions.
8530
8531 The exception is constant 0, which can be expressed
8532 as XZR/WZR and is therefore free. The exception to this is
8533 if we have (set (reg) (const0_rtx)) in which case we must cost
8534 the move. However, we can catch that when we cost the SET, so
8535 we don't need to consider that here. */
8536 if (x == const0_rtx)
8537 *cost = 0;
8538 else
8539 {
8540 /* To an approximation, building any other constant is
8541 proportionally expensive to the number of instructions
8542 required to build that constant. This is true whether we
8543 are compiling for SPEED or otherwise. */
8544 if (!is_a <scalar_int_mode> (mode, &int_mode))
8545 int_mode = word_mode;
8546 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8547 (NULL_RTX, x, false, int_mode));
8548 }
8549 return true;
8550
8551 case CONST_DOUBLE:
8552
8553 /* First determine number of instructions to do the move
8554 as an integer constant. */
8555 if (!aarch64_float_const_representable_p (x)
8556 && !aarch64_can_const_movi_rtx_p (x, mode)
8557 && aarch64_float_const_rtx_p (x))
8558 {
8559 unsigned HOST_WIDE_INT ival;
8560 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8561 gcc_assert (succeed);
8562
8563 scalar_int_mode imode = (mode == HFmode
8564 ? SImode
8565 : int_mode_for_mode (mode).require ());
8566 int ncost = aarch64_internal_mov_immediate
8567 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8568 *cost += COSTS_N_INSNS (ncost);
8569 return true;
8570 }
8571
8572 if (speed)
8573 {
8574 /* mov[df,sf]_aarch64. */
8575 if (aarch64_float_const_representable_p (x))
8576 /* FMOV (scalar immediate). */
8577 *cost += extra_cost->fp[mode == DFmode].fpconst;
8578 else if (!aarch64_float_const_zero_rtx_p (x))
8579 {
8580 /* This will be a load from memory. */
8581 if (mode == DFmode)
8582 *cost += extra_cost->ldst.loadd;
8583 else
8584 *cost += extra_cost->ldst.loadf;
8585 }
8586 else
8587 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8588 or MOV v0.s[0], wzr - neither of which are modeled by the
8589 cost tables. Just use the default cost. */
8590 {
8591 }
8592 }
8593
8594 return true;
8595
8596 case MEM:
8597 if (speed)
8598 {
8599 /* For loads we want the base cost of a load, plus an
8600 approximation for the additional cost of the addressing
8601 mode. */
8602 rtx address = XEXP (x, 0);
8603 if (VECTOR_MODE_P (mode))
8604 *cost += extra_cost->ldst.loadv;
8605 else if (GET_MODE_CLASS (mode) == MODE_INT)
8606 *cost += extra_cost->ldst.load;
8607 else if (mode == SFmode)
8608 *cost += extra_cost->ldst.loadf;
8609 else if (mode == DFmode)
8610 *cost += extra_cost->ldst.loadd;
8611
8612 *cost +=
8613 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8614 0, speed));
8615 }
8616
8617 return true;
8618
8619 case NEG:
8620 op0 = XEXP (x, 0);
8621
8622 if (VECTOR_MODE_P (mode))
8623 {
8624 if (speed)
8625 {
8626 /* FNEG. */
8627 *cost += extra_cost->vect.alu;
8628 }
8629 return false;
8630 }
8631
8632 if (GET_MODE_CLASS (mode) == MODE_INT)
8633 {
8634 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8635 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8636 {
8637 /* CSETM. */
8638 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8639 return true;
8640 }
8641
8642 /* Cost this as SUB wzr, X. */
8643 op0 = CONST0_RTX (mode);
8644 op1 = XEXP (x, 0);
8645 goto cost_minus;
8646 }
8647
8648 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8649 {
8650 /* Support (neg(fma...)) as a single instruction only if
8651 sign of zeros is unimportant. This matches the decision
8652 making in aarch64.md. */
8653 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8654 {
8655 /* FNMADD. */
8656 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8657 return true;
8658 }
8659 if (GET_CODE (op0) == MULT)
8660 {
8661 /* FNMUL. */
8662 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8663 return true;
8664 }
8665 if (speed)
8666 /* FNEG. */
8667 *cost += extra_cost->fp[mode == DFmode].neg;
8668 return false;
8669 }
8670
8671 return false;
8672
8673 case CLRSB:
8674 case CLZ:
8675 if (speed)
8676 {
8677 if (VECTOR_MODE_P (mode))
8678 *cost += extra_cost->vect.alu;
8679 else
8680 *cost += extra_cost->alu.clz;
8681 }
8682
8683 return false;
8684
8685 case COMPARE:
8686 op0 = XEXP (x, 0);
8687 op1 = XEXP (x, 1);
8688
8689 if (op1 == const0_rtx
8690 && GET_CODE (op0) == AND)
8691 {
8692 x = op0;
8693 mode = GET_MODE (op0);
8694 goto cost_logic;
8695 }
8696
8697 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8698 {
8699 /* TODO: A write to the CC flags possibly costs extra, this
8700 needs encoding in the cost tables. */
8701
8702 mode = GET_MODE (op0);
8703 /* ANDS. */
8704 if (GET_CODE (op0) == AND)
8705 {
8706 x = op0;
8707 goto cost_logic;
8708 }
8709
8710 if (GET_CODE (op0) == PLUS)
8711 {
8712 /* ADDS (and CMN alias). */
8713 x = op0;
8714 goto cost_plus;
8715 }
8716
8717 if (GET_CODE (op0) == MINUS)
8718 {
8719 /* SUBS. */
8720 x = op0;
8721 goto cost_minus;
8722 }
8723
8724 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8725 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8726 && CONST_INT_P (XEXP (op0, 2)))
8727 {
8728 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8729 Handle it here directly rather than going to cost_logic
8730 since we know the immediate generated for the TST is valid
8731 so we can avoid creating an intermediate rtx for it only
8732 for costing purposes. */
8733 if (speed)
8734 *cost += extra_cost->alu.logical;
8735
8736 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8737 ZERO_EXTRACT, 0, speed);
8738 return true;
8739 }
8740
8741 if (GET_CODE (op1) == NEG)
8742 {
8743 /* CMN. */
8744 if (speed)
8745 *cost += extra_cost->alu.arith;
8746
8747 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8748 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8749 return true;
8750 }
8751
8752 /* CMP.
8753
8754 Compare can freely swap the order of operands, and
8755 canonicalization puts the more complex operation first.
8756 But the integer MINUS logic expects the shift/extend
8757 operation in op1. */
8758 if (! (REG_P (op0)
8759 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8760 {
8761 op0 = XEXP (x, 1);
8762 op1 = XEXP (x, 0);
8763 }
8764 goto cost_minus;
8765 }
8766
8767 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8768 {
8769 /* FCMP. */
8770 if (speed)
8771 *cost += extra_cost->fp[mode == DFmode].compare;
8772
8773 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8774 {
8775 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8776 /* FCMP supports constant 0.0 for no extra cost. */
8777 return true;
8778 }
8779 return false;
8780 }
8781
8782 if (VECTOR_MODE_P (mode))
8783 {
8784 /* Vector compare. */
8785 if (speed)
8786 *cost += extra_cost->vect.alu;
8787
8788 if (aarch64_float_const_zero_rtx_p (op1))
8789 {
8790 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8791 cost. */
8792 return true;
8793 }
8794 return false;
8795 }
8796 return false;
8797
8798 case MINUS:
8799 {
8800 op0 = XEXP (x, 0);
8801 op1 = XEXP (x, 1);
8802
8803 cost_minus:
8804 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8805
8806 /* Detect valid immediates. */
8807 if ((GET_MODE_CLASS (mode) == MODE_INT
8808 || (GET_MODE_CLASS (mode) == MODE_CC
8809 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8810 && CONST_INT_P (op1)
8811 && aarch64_uimm12_shift (INTVAL (op1)))
8812 {
8813 if (speed)
8814 /* SUB(S) (immediate). */
8815 *cost += extra_cost->alu.arith;
8816 return true;
8817 }
8818
8819 /* Look for SUB (extended register). */
8820 if (is_a <scalar_int_mode> (mode, &int_mode)
8821 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8822 {
8823 if (speed)
8824 *cost += extra_cost->alu.extend_arith;
8825
8826 op1 = aarch64_strip_extend (op1, true);
8827 *cost += rtx_cost (op1, VOIDmode,
8828 (enum rtx_code) GET_CODE (op1), 0, speed);
8829 return true;
8830 }
8831
8832 rtx new_op1 = aarch64_strip_extend (op1, false);
8833
8834 /* Cost this as an FMA-alike operation. */
8835 if ((GET_CODE (new_op1) == MULT
8836 || aarch64_shift_p (GET_CODE (new_op1)))
8837 && code != COMPARE)
8838 {
8839 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8840 (enum rtx_code) code,
8841 speed);
8842 return true;
8843 }
8844
8845 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8846
8847 if (speed)
8848 {
8849 if (VECTOR_MODE_P (mode))
8850 {
8851 /* Vector SUB. */
8852 *cost += extra_cost->vect.alu;
8853 }
8854 else if (GET_MODE_CLASS (mode) == MODE_INT)
8855 {
8856 /* SUB(S). */
8857 *cost += extra_cost->alu.arith;
8858 }
8859 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8860 {
8861 /* FSUB. */
8862 *cost += extra_cost->fp[mode == DFmode].addsub;
8863 }
8864 }
8865 return true;
8866 }
8867
8868 case PLUS:
8869 {
8870 rtx new_op0;
8871
8872 op0 = XEXP (x, 0);
8873 op1 = XEXP (x, 1);
8874
8875 cost_plus:
8876 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8877 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8878 {
8879 /* CSINC. */
8880 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8881 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8882 return true;
8883 }
8884
8885 if (GET_MODE_CLASS (mode) == MODE_INT
8886 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8887 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8888 {
8889 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8890
8891 if (speed)
8892 /* ADD (immediate). */
8893 *cost += extra_cost->alu.arith;
8894 return true;
8895 }
8896
8897 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8898
8899 /* Look for ADD (extended register). */
8900 if (is_a <scalar_int_mode> (mode, &int_mode)
8901 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8902 {
8903 if (speed)
8904 *cost += extra_cost->alu.extend_arith;
8905
8906 op0 = aarch64_strip_extend (op0, true);
8907 *cost += rtx_cost (op0, VOIDmode,
8908 (enum rtx_code) GET_CODE (op0), 0, speed);
8909 return true;
8910 }
8911
8912 /* Strip any extend, leave shifts behind as we will
8913 cost them through mult_cost. */
8914 new_op0 = aarch64_strip_extend (op0, false);
8915
8916 if (GET_CODE (new_op0) == MULT
8917 || aarch64_shift_p (GET_CODE (new_op0)))
8918 {
8919 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8920 speed);
8921 return true;
8922 }
8923
8924 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8925
8926 if (speed)
8927 {
8928 if (VECTOR_MODE_P (mode))
8929 {
8930 /* Vector ADD. */
8931 *cost += extra_cost->vect.alu;
8932 }
8933 else if (GET_MODE_CLASS (mode) == MODE_INT)
8934 {
8935 /* ADD. */
8936 *cost += extra_cost->alu.arith;
8937 }
8938 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8939 {
8940 /* FADD. */
8941 *cost += extra_cost->fp[mode == DFmode].addsub;
8942 }
8943 }
8944 return true;
8945 }
8946
8947 case BSWAP:
8948 *cost = COSTS_N_INSNS (1);
8949
8950 if (speed)
8951 {
8952 if (VECTOR_MODE_P (mode))
8953 *cost += extra_cost->vect.alu;
8954 else
8955 *cost += extra_cost->alu.rev;
8956 }
8957 return false;
8958
8959 case IOR:
8960 if (aarch_rev16_p (x))
8961 {
8962 *cost = COSTS_N_INSNS (1);
8963
8964 if (speed)
8965 {
8966 if (VECTOR_MODE_P (mode))
8967 *cost += extra_cost->vect.alu;
8968 else
8969 *cost += extra_cost->alu.rev;
8970 }
8971 return true;
8972 }
8973
8974 if (aarch64_extr_rtx_p (x, &op0, &op1))
8975 {
8976 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8977 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8978 if (speed)
8979 *cost += extra_cost->alu.shift;
8980
8981 return true;
8982 }
8983 /* Fall through. */
8984 case XOR:
8985 case AND:
8986 cost_logic:
8987 op0 = XEXP (x, 0);
8988 op1 = XEXP (x, 1);
8989
8990 if (VECTOR_MODE_P (mode))
8991 {
8992 if (speed)
8993 *cost += extra_cost->vect.alu;
8994 return true;
8995 }
8996
8997 if (code == AND
8998 && GET_CODE (op0) == MULT
8999 && CONST_INT_P (XEXP (op0, 1))
9000 && CONST_INT_P (op1)
9001 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9002 INTVAL (op1)) != 0)
9003 {
9004 /* This is a UBFM/SBFM. */
9005 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9006 if (speed)
9007 *cost += extra_cost->alu.bfx;
9008 return true;
9009 }
9010
9011 if (is_int_mode (mode, &int_mode))
9012 {
9013 if (CONST_INT_P (op1))
9014 {
9015 /* We have a mask + shift version of a UBFIZ
9016 i.e. the *andim_ashift<mode>_bfiz pattern. */
9017 if (GET_CODE (op0) == ASHIFT
9018 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9019 XEXP (op0, 1)))
9020 {
9021 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9022 (enum rtx_code) code, 0, speed);
9023 if (speed)
9024 *cost += extra_cost->alu.bfx;
9025
9026 return true;
9027 }
9028 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9029 {
9030 /* We possibly get the immediate for free, this is not
9031 modelled. */
9032 *cost += rtx_cost (op0, int_mode,
9033 (enum rtx_code) code, 0, speed);
9034 if (speed)
9035 *cost += extra_cost->alu.logical;
9036
9037 return true;
9038 }
9039 }
9040 else
9041 {
9042 rtx new_op0 = op0;
9043
9044 /* Handle ORN, EON, or BIC. */
9045 if (GET_CODE (op0) == NOT)
9046 op0 = XEXP (op0, 0);
9047
9048 new_op0 = aarch64_strip_shift (op0);
9049
9050 /* If we had a shift on op0 then this is a logical-shift-
9051 by-register/immediate operation. Otherwise, this is just
9052 a logical operation. */
9053 if (speed)
9054 {
9055 if (new_op0 != op0)
9056 {
9057 /* Shift by immediate. */
9058 if (CONST_INT_P (XEXP (op0, 1)))
9059 *cost += extra_cost->alu.log_shift;
9060 else
9061 *cost += extra_cost->alu.log_shift_reg;
9062 }
9063 else
9064 *cost += extra_cost->alu.logical;
9065 }
9066
9067 /* In both cases we want to cost both operands. */
9068 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9069 0, speed);
9070 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9071 1, speed);
9072
9073 return true;
9074 }
9075 }
9076 return false;
9077
9078 case NOT:
9079 x = XEXP (x, 0);
9080 op0 = aarch64_strip_shift (x);
9081
9082 if (VECTOR_MODE_P (mode))
9083 {
9084 /* Vector NOT. */
9085 *cost += extra_cost->vect.alu;
9086 return false;
9087 }
9088
9089 /* MVN-shifted-reg. */
9090 if (op0 != x)
9091 {
9092 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9093
9094 if (speed)
9095 *cost += extra_cost->alu.log_shift;
9096
9097 return true;
9098 }
9099 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9100 Handle the second form here taking care that 'a' in the above can
9101 be a shift. */
9102 else if (GET_CODE (op0) == XOR)
9103 {
9104 rtx newop0 = XEXP (op0, 0);
9105 rtx newop1 = XEXP (op0, 1);
9106 rtx op0_stripped = aarch64_strip_shift (newop0);
9107
9108 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9109 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9110
9111 if (speed)
9112 {
9113 if (op0_stripped != newop0)
9114 *cost += extra_cost->alu.log_shift;
9115 else
9116 *cost += extra_cost->alu.logical;
9117 }
9118
9119 return true;
9120 }
9121 /* MVN. */
9122 if (speed)
9123 *cost += extra_cost->alu.logical;
9124
9125 return false;
9126
9127 case ZERO_EXTEND:
9128
9129 op0 = XEXP (x, 0);
9130 /* If a value is written in SI mode, then zero extended to DI
9131 mode, the operation will in general be free as a write to
9132 a 'w' register implicitly zeroes the upper bits of an 'x'
9133 register. However, if this is
9134
9135 (set (reg) (zero_extend (reg)))
9136
9137 we must cost the explicit register move. */
9138 if (mode == DImode
9139 && GET_MODE (op0) == SImode
9140 && outer == SET)
9141 {
9142 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9143
9144 /* If OP_COST is non-zero, then the cost of the zero extend
9145 is effectively the cost of the inner operation. Otherwise
9146 we have a MOV instruction and we take the cost from the MOV
9147 itself. This is true independently of whether we are
9148 optimizing for space or time. */
9149 if (op_cost)
9150 *cost = op_cost;
9151
9152 return true;
9153 }
9154 else if (MEM_P (op0))
9155 {
9156 /* All loads can zero extend to any size for free. */
9157 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9158 return true;
9159 }
9160
9161 op0 = aarch64_extend_bitfield_pattern_p (x);
9162 if (op0)
9163 {
9164 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9165 if (speed)
9166 *cost += extra_cost->alu.bfx;
9167 return true;
9168 }
9169
9170 if (speed)
9171 {
9172 if (VECTOR_MODE_P (mode))
9173 {
9174 /* UMOV. */
9175 *cost += extra_cost->vect.alu;
9176 }
9177 else
9178 {
9179 /* We generate an AND instead of UXTB/UXTH. */
9180 *cost += extra_cost->alu.logical;
9181 }
9182 }
9183 return false;
9184
9185 case SIGN_EXTEND:
9186 if (MEM_P (XEXP (x, 0)))
9187 {
9188 /* LDRSH. */
9189 if (speed)
9190 {
9191 rtx address = XEXP (XEXP (x, 0), 0);
9192 *cost += extra_cost->ldst.load_sign_extend;
9193
9194 *cost +=
9195 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9196 0, speed));
9197 }
9198 return true;
9199 }
9200
9201 op0 = aarch64_extend_bitfield_pattern_p (x);
9202 if (op0)
9203 {
9204 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9205 if (speed)
9206 *cost += extra_cost->alu.bfx;
9207 return true;
9208 }
9209
9210 if (speed)
9211 {
9212 if (VECTOR_MODE_P (mode))
9213 *cost += extra_cost->vect.alu;
9214 else
9215 *cost += extra_cost->alu.extend;
9216 }
9217 return false;
9218
9219 case ASHIFT:
9220 op0 = XEXP (x, 0);
9221 op1 = XEXP (x, 1);
9222
9223 if (CONST_INT_P (op1))
9224 {
9225 if (speed)
9226 {
9227 if (VECTOR_MODE_P (mode))
9228 {
9229 /* Vector shift (immediate). */
9230 *cost += extra_cost->vect.alu;
9231 }
9232 else
9233 {
9234 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9235 aliases. */
9236 *cost += extra_cost->alu.shift;
9237 }
9238 }
9239
9240 /* We can incorporate zero/sign extend for free. */
9241 if (GET_CODE (op0) == ZERO_EXTEND
9242 || GET_CODE (op0) == SIGN_EXTEND)
9243 op0 = XEXP (op0, 0);
9244
9245 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9246 return true;
9247 }
9248 else
9249 {
9250 if (VECTOR_MODE_P (mode))
9251 {
9252 if (speed)
9253 /* Vector shift (register). */
9254 *cost += extra_cost->vect.alu;
9255 }
9256 else
9257 {
9258 if (speed)
9259 /* LSLV. */
9260 *cost += extra_cost->alu.shift_reg;
9261
9262 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9263 && CONST_INT_P (XEXP (op1, 1))
9264 && known_eq (INTVAL (XEXP (op1, 1)),
9265 GET_MODE_BITSIZE (mode) - 1))
9266 {
9267 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9268 /* We already demanded XEXP (op1, 0) to be REG_P, so
9269 don't recurse into it. */
9270 return true;
9271 }
9272 }
9273 return false; /* All arguments need to be in registers. */
9274 }
9275
9276 case ROTATE:
9277 case ROTATERT:
9278 case LSHIFTRT:
9279 case ASHIFTRT:
9280 op0 = XEXP (x, 0);
9281 op1 = XEXP (x, 1);
9282
9283 if (CONST_INT_P (op1))
9284 {
9285 /* ASR (immediate) and friends. */
9286 if (speed)
9287 {
9288 if (VECTOR_MODE_P (mode))
9289 *cost += extra_cost->vect.alu;
9290 else
9291 *cost += extra_cost->alu.shift;
9292 }
9293
9294 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9295 return true;
9296 }
9297 else
9298 {
9299 if (VECTOR_MODE_P (mode))
9300 {
9301 if (speed)
9302 /* Vector shift (register). */
9303 *cost += extra_cost->vect.alu;
9304 }
9305 else
9306 {
9307 if (speed)
9308 /* ASR (register) and friends. */
9309 *cost += extra_cost->alu.shift_reg;
9310
9311 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9312 && CONST_INT_P (XEXP (op1, 1))
9313 && known_eq (INTVAL (XEXP (op1, 1)),
9314 GET_MODE_BITSIZE (mode) - 1))
9315 {
9316 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9317 /* We already demanded XEXP (op1, 0) to be REG_P, so
9318 don't recurse into it. */
9319 return true;
9320 }
9321 }
9322 return false; /* All arguments need to be in registers. */
9323 }
9324
9325 case SYMBOL_REF:
9326
9327 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9328 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9329 {
9330 /* LDR. */
9331 if (speed)
9332 *cost += extra_cost->ldst.load;
9333 }
9334 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9335 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9336 {
9337 /* ADRP, followed by ADD. */
9338 *cost += COSTS_N_INSNS (1);
9339 if (speed)
9340 *cost += 2 * extra_cost->alu.arith;
9341 }
9342 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9343 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9344 {
9345 /* ADR. */
9346 if (speed)
9347 *cost += extra_cost->alu.arith;
9348 }
9349
9350 if (flag_pic)
9351 {
9352 /* One extra load instruction, after accessing the GOT. */
9353 *cost += COSTS_N_INSNS (1);
9354 if (speed)
9355 *cost += extra_cost->ldst.load;
9356 }
9357 return true;
9358
9359 case HIGH:
9360 case LO_SUM:
9361 /* ADRP/ADD (immediate). */
9362 if (speed)
9363 *cost += extra_cost->alu.arith;
9364 return true;
9365
9366 case ZERO_EXTRACT:
9367 case SIGN_EXTRACT:
9368 /* UBFX/SBFX. */
9369 if (speed)
9370 {
9371 if (VECTOR_MODE_P (mode))
9372 *cost += extra_cost->vect.alu;
9373 else
9374 *cost += extra_cost->alu.bfx;
9375 }
9376
9377 /* We can trust that the immediates used will be correct (there
9378 are no by-register forms), so we need only cost op0. */
9379 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9380 return true;
9381
9382 case MULT:
9383 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9384 /* aarch64_rtx_mult_cost always handles recursion to its
9385 operands. */
9386 return true;
9387
9388 case MOD:
9389 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9390 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9391 an unconditional negate. This case should only ever be reached through
9392 the set_smod_pow2_cheap check in expmed.c. */
9393 if (CONST_INT_P (XEXP (x, 1))
9394 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9395 && (mode == SImode || mode == DImode))
9396 {
9397 /* We expand to 4 instructions. Reset the baseline. */
9398 *cost = COSTS_N_INSNS (4);
9399
9400 if (speed)
9401 *cost += 2 * extra_cost->alu.logical
9402 + 2 * extra_cost->alu.arith;
9403
9404 return true;
9405 }
9406
9407 /* Fall-through. */
9408 case UMOD:
9409 if (speed)
9410 {
9411 /* Slighly prefer UMOD over SMOD. */
9412 if (VECTOR_MODE_P (mode))
9413 *cost += extra_cost->vect.alu;
9414 else if (GET_MODE_CLASS (mode) == MODE_INT)
9415 *cost += (extra_cost->mult[mode == DImode].add
9416 + extra_cost->mult[mode == DImode].idiv
9417 + (code == MOD ? 1 : 0));
9418 }
9419 return false; /* All arguments need to be in registers. */
9420
9421 case DIV:
9422 case UDIV:
9423 case SQRT:
9424 if (speed)
9425 {
9426 if (VECTOR_MODE_P (mode))
9427 *cost += extra_cost->vect.alu;
9428 else if (GET_MODE_CLASS (mode) == MODE_INT)
9429 /* There is no integer SQRT, so only DIV and UDIV can get
9430 here. */
9431 *cost += (extra_cost->mult[mode == DImode].idiv
9432 /* Slighly prefer UDIV over SDIV. */
9433 + (code == DIV ? 1 : 0));
9434 else
9435 *cost += extra_cost->fp[mode == DFmode].div;
9436 }
9437 return false; /* All arguments need to be in registers. */
9438
9439 case IF_THEN_ELSE:
9440 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9441 XEXP (x, 2), cost, speed);
9442
9443 case EQ:
9444 case NE:
9445 case GT:
9446 case GTU:
9447 case LT:
9448 case LTU:
9449 case GE:
9450 case GEU:
9451 case LE:
9452 case LEU:
9453
9454 return false; /* All arguments must be in registers. */
9455
9456 case FMA:
9457 op0 = XEXP (x, 0);
9458 op1 = XEXP (x, 1);
9459 op2 = XEXP (x, 2);
9460
9461 if (speed)
9462 {
9463 if (VECTOR_MODE_P (mode))
9464 *cost += extra_cost->vect.alu;
9465 else
9466 *cost += extra_cost->fp[mode == DFmode].fma;
9467 }
9468
9469 /* FMSUB, FNMADD, and FNMSUB are free. */
9470 if (GET_CODE (op0) == NEG)
9471 op0 = XEXP (op0, 0);
9472
9473 if (GET_CODE (op2) == NEG)
9474 op2 = XEXP (op2, 0);
9475
9476 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9477 and the by-element operand as operand 0. */
9478 if (GET_CODE (op1) == NEG)
9479 op1 = XEXP (op1, 0);
9480
9481 /* Catch vector-by-element operations. The by-element operand can
9482 either be (vec_duplicate (vec_select (x))) or just
9483 (vec_select (x)), depending on whether we are multiplying by
9484 a vector or a scalar.
9485
9486 Canonicalization is not very good in these cases, FMA4 will put the
9487 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9488 if (GET_CODE (op0) == VEC_DUPLICATE)
9489 op0 = XEXP (op0, 0);
9490 else if (GET_CODE (op1) == VEC_DUPLICATE)
9491 op1 = XEXP (op1, 0);
9492
9493 if (GET_CODE (op0) == VEC_SELECT)
9494 op0 = XEXP (op0, 0);
9495 else if (GET_CODE (op1) == VEC_SELECT)
9496 op1 = XEXP (op1, 0);
9497
9498 /* If the remaining parameters are not registers,
9499 get the cost to put them into registers. */
9500 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9501 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9502 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9503 return true;
9504
9505 case FLOAT:
9506 case UNSIGNED_FLOAT:
9507 if (speed)
9508 *cost += extra_cost->fp[mode == DFmode].fromint;
9509 return false;
9510
9511 case FLOAT_EXTEND:
9512 if (speed)
9513 {
9514 if (VECTOR_MODE_P (mode))
9515 {
9516 /*Vector truncate. */
9517 *cost += extra_cost->vect.alu;
9518 }
9519 else
9520 *cost += extra_cost->fp[mode == DFmode].widen;
9521 }
9522 return false;
9523
9524 case FLOAT_TRUNCATE:
9525 if (speed)
9526 {
9527 if (VECTOR_MODE_P (mode))
9528 {
9529 /*Vector conversion. */
9530 *cost += extra_cost->vect.alu;
9531 }
9532 else
9533 *cost += extra_cost->fp[mode == DFmode].narrow;
9534 }
9535 return false;
9536
9537 case FIX:
9538 case UNSIGNED_FIX:
9539 x = XEXP (x, 0);
9540 /* Strip the rounding part. They will all be implemented
9541 by the fcvt* family of instructions anyway. */
9542 if (GET_CODE (x) == UNSPEC)
9543 {
9544 unsigned int uns_code = XINT (x, 1);
9545
9546 if (uns_code == UNSPEC_FRINTA
9547 || uns_code == UNSPEC_FRINTM
9548 || uns_code == UNSPEC_FRINTN
9549 || uns_code == UNSPEC_FRINTP
9550 || uns_code == UNSPEC_FRINTZ)
9551 x = XVECEXP (x, 0, 0);
9552 }
9553
9554 if (speed)
9555 {
9556 if (VECTOR_MODE_P (mode))
9557 *cost += extra_cost->vect.alu;
9558 else
9559 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9560 }
9561
9562 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9563 fixed-point fcvt. */
9564 if (GET_CODE (x) == MULT
9565 && ((VECTOR_MODE_P (mode)
9566 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9567 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9568 {
9569 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9570 0, speed);
9571 return true;
9572 }
9573
9574 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9575 return true;
9576
9577 case ABS:
9578 if (VECTOR_MODE_P (mode))
9579 {
9580 /* ABS (vector). */
9581 if (speed)
9582 *cost += extra_cost->vect.alu;
9583 }
9584 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9585 {
9586 op0 = XEXP (x, 0);
9587
9588 /* FABD, which is analogous to FADD. */
9589 if (GET_CODE (op0) == MINUS)
9590 {
9591 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9592 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9593 if (speed)
9594 *cost += extra_cost->fp[mode == DFmode].addsub;
9595
9596 return true;
9597 }
9598 /* Simple FABS is analogous to FNEG. */
9599 if (speed)
9600 *cost += extra_cost->fp[mode == DFmode].neg;
9601 }
9602 else
9603 {
9604 /* Integer ABS will either be split to
9605 two arithmetic instructions, or will be an ABS
9606 (scalar), which we don't model. */
9607 *cost = COSTS_N_INSNS (2);
9608 if (speed)
9609 *cost += 2 * extra_cost->alu.arith;
9610 }
9611 return false;
9612
9613 case SMAX:
9614 case SMIN:
9615 if (speed)
9616 {
9617 if (VECTOR_MODE_P (mode))
9618 *cost += extra_cost->vect.alu;
9619 else
9620 {
9621 /* FMAXNM/FMINNM/FMAX/FMIN.
9622 TODO: This may not be accurate for all implementations, but
9623 we do not model this in the cost tables. */
9624 *cost += extra_cost->fp[mode == DFmode].addsub;
9625 }
9626 }
9627 return false;
9628
9629 case UNSPEC:
9630 /* The floating point round to integer frint* instructions. */
9631 if (aarch64_frint_unspec_p (XINT (x, 1)))
9632 {
9633 if (speed)
9634 *cost += extra_cost->fp[mode == DFmode].roundint;
9635
9636 return false;
9637 }
9638
9639 if (XINT (x, 1) == UNSPEC_RBIT)
9640 {
9641 if (speed)
9642 *cost += extra_cost->alu.rev;
9643
9644 return false;
9645 }
9646 break;
9647
9648 case TRUNCATE:
9649
9650 /* Decompose <su>muldi3_highpart. */
9651 if (/* (truncate:DI */
9652 mode == DImode
9653 /* (lshiftrt:TI */
9654 && GET_MODE (XEXP (x, 0)) == TImode
9655 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9656 /* (mult:TI */
9657 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9658 /* (ANY_EXTEND:TI (reg:DI))
9659 (ANY_EXTEND:TI (reg:DI))) */
9660 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9661 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9662 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9663 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9664 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9665 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9666 /* (const_int 64) */
9667 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9668 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9669 {
9670 /* UMULH/SMULH. */
9671 if (speed)
9672 *cost += extra_cost->mult[mode == DImode].extend;
9673 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9674 mode, MULT, 0, speed);
9675 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9676 mode, MULT, 1, speed);
9677 return true;
9678 }
9679
9680 /* Fall through. */
9681 default:
9682 break;
9683 }
9684
9685 if (dump_file
9686 && flag_aarch64_verbose_cost)
9687 fprintf (dump_file,
9688 "\nFailed to cost RTX. Assuming default cost.\n");
9689
9690 return true;
9691 }
9692
9693 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9694 calculated for X. This cost is stored in *COST. Returns true
9695 if the total cost of X was calculated. */
9696 static bool
9697 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9698 int param, int *cost, bool speed)
9699 {
9700 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9701
9702 if (dump_file
9703 && flag_aarch64_verbose_cost)
9704 {
9705 print_rtl_single (dump_file, x);
9706 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9707 speed ? "Hot" : "Cold",
9708 *cost, result ? "final" : "partial");
9709 }
9710
9711 return result;
9712 }
9713
9714 static int
9715 aarch64_register_move_cost (machine_mode mode,
9716 reg_class_t from_i, reg_class_t to_i)
9717 {
9718 enum reg_class from = (enum reg_class) from_i;
9719 enum reg_class to = (enum reg_class) to_i;
9720 const struct cpu_regmove_cost *regmove_cost
9721 = aarch64_tune_params.regmove_cost;
9722
9723 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9724 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9725 to = GENERAL_REGS;
9726
9727 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9728 from = GENERAL_REGS;
9729
9730 /* Moving between GPR and stack cost is the same as GP2GP. */
9731 if ((from == GENERAL_REGS && to == STACK_REG)
9732 || (to == GENERAL_REGS && from == STACK_REG))
9733 return regmove_cost->GP2GP;
9734
9735 /* To/From the stack register, we move via the gprs. */
9736 if (to == STACK_REG || from == STACK_REG)
9737 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9738 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9739
9740 if (known_eq (GET_MODE_SIZE (mode), 16))
9741 {
9742 /* 128-bit operations on general registers require 2 instructions. */
9743 if (from == GENERAL_REGS && to == GENERAL_REGS)
9744 return regmove_cost->GP2GP * 2;
9745 else if (from == GENERAL_REGS)
9746 return regmove_cost->GP2FP * 2;
9747 else if (to == GENERAL_REGS)
9748 return regmove_cost->FP2GP * 2;
9749
9750 /* When AdvSIMD instructions are disabled it is not possible to move
9751 a 128-bit value directly between Q registers. This is handled in
9752 secondary reload. A general register is used as a scratch to move
9753 the upper DI value and the lower DI value is moved directly,
9754 hence the cost is the sum of three moves. */
9755 if (! TARGET_SIMD)
9756 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9757
9758 return regmove_cost->FP2FP;
9759 }
9760
9761 if (from == GENERAL_REGS && to == GENERAL_REGS)
9762 return regmove_cost->GP2GP;
9763 else if (from == GENERAL_REGS)
9764 return regmove_cost->GP2FP;
9765 else if (to == GENERAL_REGS)
9766 return regmove_cost->FP2GP;
9767
9768 return regmove_cost->FP2FP;
9769 }
9770
9771 static int
9772 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9773 reg_class_t rclass ATTRIBUTE_UNUSED,
9774 bool in ATTRIBUTE_UNUSED)
9775 {
9776 return aarch64_tune_params.memmov_cost;
9777 }
9778
9779 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9780 to optimize 1.0/sqrt. */
9781
9782 static bool
9783 use_rsqrt_p (machine_mode mode)
9784 {
9785 return (!flag_trapping_math
9786 && flag_unsafe_math_optimizations
9787 && ((aarch64_tune_params.approx_modes->recip_sqrt
9788 & AARCH64_APPROX_MODE (mode))
9789 || flag_mrecip_low_precision_sqrt));
9790 }
9791
9792 /* Function to decide when to use the approximate reciprocal square root
9793 builtin. */
9794
9795 static tree
9796 aarch64_builtin_reciprocal (tree fndecl)
9797 {
9798 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9799
9800 if (!use_rsqrt_p (mode))
9801 return NULL_TREE;
9802 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9803 }
9804
9805 typedef rtx (*rsqrte_type) (rtx, rtx);
9806
9807 /* Select reciprocal square root initial estimate insn depending on machine
9808 mode. */
9809
9810 static rsqrte_type
9811 get_rsqrte_type (machine_mode mode)
9812 {
9813 switch (mode)
9814 {
9815 case E_DFmode: return gen_aarch64_rsqrtedf;
9816 case E_SFmode: return gen_aarch64_rsqrtesf;
9817 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9818 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9819 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9820 default: gcc_unreachable ();
9821 }
9822 }
9823
9824 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9825
9826 /* Select reciprocal square root series step insn depending on machine mode. */
9827
9828 static rsqrts_type
9829 get_rsqrts_type (machine_mode mode)
9830 {
9831 switch (mode)
9832 {
9833 case E_DFmode: return gen_aarch64_rsqrtsdf;
9834 case E_SFmode: return gen_aarch64_rsqrtssf;
9835 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9836 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9837 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9838 default: gcc_unreachable ();
9839 }
9840 }
9841
9842 /* Emit instruction sequence to compute either the approximate square root
9843 or its approximate reciprocal, depending on the flag RECP, and return
9844 whether the sequence was emitted or not. */
9845
9846 bool
9847 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9848 {
9849 machine_mode mode = GET_MODE (dst);
9850
9851 if (GET_MODE_INNER (mode) == HFmode)
9852 {
9853 gcc_assert (!recp);
9854 return false;
9855 }
9856
9857 if (!recp)
9858 {
9859 if (!(flag_mlow_precision_sqrt
9860 || (aarch64_tune_params.approx_modes->sqrt
9861 & AARCH64_APPROX_MODE (mode))))
9862 return false;
9863
9864 if (flag_finite_math_only
9865 || flag_trapping_math
9866 || !flag_unsafe_math_optimizations
9867 || optimize_function_for_size_p (cfun))
9868 return false;
9869 }
9870 else
9871 /* Caller assumes we cannot fail. */
9872 gcc_assert (use_rsqrt_p (mode));
9873
9874 machine_mode mmsk = mode_for_int_vector (mode).require ();
9875 rtx xmsk = gen_reg_rtx (mmsk);
9876 if (!recp)
9877 /* When calculating the approximate square root, compare the
9878 argument with 0.0 and create a mask. */
9879 emit_insn (gen_rtx_SET (xmsk,
9880 gen_rtx_NEG (mmsk,
9881 gen_rtx_EQ (mmsk, src,
9882 CONST0_RTX (mode)))));
9883
9884 /* Estimate the approximate reciprocal square root. */
9885 rtx xdst = gen_reg_rtx (mode);
9886 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9887
9888 /* Iterate over the series twice for SF and thrice for DF. */
9889 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9890
9891 /* Optionally iterate over the series once less for faster performance
9892 while sacrificing the accuracy. */
9893 if ((recp && flag_mrecip_low_precision_sqrt)
9894 || (!recp && flag_mlow_precision_sqrt))
9895 iterations--;
9896
9897 /* Iterate over the series to calculate the approximate reciprocal square
9898 root. */
9899 rtx x1 = gen_reg_rtx (mode);
9900 while (iterations--)
9901 {
9902 rtx x2 = gen_reg_rtx (mode);
9903 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9904
9905 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9906
9907 if (iterations > 0)
9908 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9909 }
9910
9911 if (!recp)
9912 {
9913 /* Qualify the approximate reciprocal square root when the argument is
9914 0.0 by squashing the intermediary result to 0.0. */
9915 rtx xtmp = gen_reg_rtx (mmsk);
9916 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9917 gen_rtx_SUBREG (mmsk, xdst, 0)));
9918 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9919
9920 /* Calculate the approximate square root. */
9921 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9922 }
9923
9924 /* Finalize the approximation. */
9925 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9926
9927 return true;
9928 }
9929
9930 typedef rtx (*recpe_type) (rtx, rtx);
9931
9932 /* Select reciprocal initial estimate insn depending on machine mode. */
9933
9934 static recpe_type
9935 get_recpe_type (machine_mode mode)
9936 {
9937 switch (mode)
9938 {
9939 case E_SFmode: return (gen_aarch64_frecpesf);
9940 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9941 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9942 case E_DFmode: return (gen_aarch64_frecpedf);
9943 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9944 default: gcc_unreachable ();
9945 }
9946 }
9947
9948 typedef rtx (*recps_type) (rtx, rtx, rtx);
9949
9950 /* Select reciprocal series step insn depending on machine mode. */
9951
9952 static recps_type
9953 get_recps_type (machine_mode mode)
9954 {
9955 switch (mode)
9956 {
9957 case E_SFmode: return (gen_aarch64_frecpssf);
9958 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9959 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9960 case E_DFmode: return (gen_aarch64_frecpsdf);
9961 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9962 default: gcc_unreachable ();
9963 }
9964 }
9965
9966 /* Emit the instruction sequence to compute the approximation for the division
9967 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9968
9969 bool
9970 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9971 {
9972 machine_mode mode = GET_MODE (quo);
9973
9974 if (GET_MODE_INNER (mode) == HFmode)
9975 return false;
9976
9977 bool use_approx_division_p = (flag_mlow_precision_div
9978 || (aarch64_tune_params.approx_modes->division
9979 & AARCH64_APPROX_MODE (mode)));
9980
9981 if (!flag_finite_math_only
9982 || flag_trapping_math
9983 || !flag_unsafe_math_optimizations
9984 || optimize_function_for_size_p (cfun)
9985 || !use_approx_division_p)
9986 return false;
9987
9988 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9989 return false;
9990
9991 /* Estimate the approximate reciprocal. */
9992 rtx xrcp = gen_reg_rtx (mode);
9993 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
9994
9995 /* Iterate over the series twice for SF and thrice for DF. */
9996 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9997
9998 /* Optionally iterate over the series once less for faster performance,
9999 while sacrificing the accuracy. */
10000 if (flag_mlow_precision_div)
10001 iterations--;
10002
10003 /* Iterate over the series to calculate the approximate reciprocal. */
10004 rtx xtmp = gen_reg_rtx (mode);
10005 while (iterations--)
10006 {
10007 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10008
10009 if (iterations > 0)
10010 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10011 }
10012
10013 if (num != CONST1_RTX (mode))
10014 {
10015 /* As the approximate reciprocal of DEN is already calculated, only
10016 calculate the approximate division when NUM is not 1.0. */
10017 rtx xnum = force_reg (mode, num);
10018 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10019 }
10020
10021 /* Finalize the approximation. */
10022 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10023 return true;
10024 }
10025
10026 /* Return the number of instructions that can be issued per cycle. */
10027 static int
10028 aarch64_sched_issue_rate (void)
10029 {
10030 return aarch64_tune_params.issue_rate;
10031 }
10032
10033 static int
10034 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10035 {
10036 int issue_rate = aarch64_sched_issue_rate ();
10037
10038 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10039 }
10040
10041
10042 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10043 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10044 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10045
10046 static int
10047 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10048 int ready_index)
10049 {
10050 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10051 }
10052
10053
10054 /* Vectorizer cost model target hooks. */
10055
10056 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10057 static int
10058 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10059 tree vectype,
10060 int misalign ATTRIBUTE_UNUSED)
10061 {
10062 unsigned elements;
10063 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10064 bool fp = false;
10065
10066 if (vectype != NULL)
10067 fp = FLOAT_TYPE_P (vectype);
10068
10069 switch (type_of_cost)
10070 {
10071 case scalar_stmt:
10072 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10073
10074 case scalar_load:
10075 return costs->scalar_load_cost;
10076
10077 case scalar_store:
10078 return costs->scalar_store_cost;
10079
10080 case vector_stmt:
10081 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10082
10083 case vector_load:
10084 return costs->vec_align_load_cost;
10085
10086 case vector_store:
10087 return costs->vec_store_cost;
10088
10089 case vec_to_scalar:
10090 return costs->vec_to_scalar_cost;
10091
10092 case scalar_to_vec:
10093 return costs->scalar_to_vec_cost;
10094
10095 case unaligned_load:
10096 case vector_gather_load:
10097 return costs->vec_unalign_load_cost;
10098
10099 case unaligned_store:
10100 case vector_scatter_store:
10101 return costs->vec_unalign_store_cost;
10102
10103 case cond_branch_taken:
10104 return costs->cond_taken_branch_cost;
10105
10106 case cond_branch_not_taken:
10107 return costs->cond_not_taken_branch_cost;
10108
10109 case vec_perm:
10110 return costs->vec_permute_cost;
10111
10112 case vec_promote_demote:
10113 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10114
10115 case vec_construct:
10116 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10117 return elements / 2 + 1;
10118
10119 default:
10120 gcc_unreachable ();
10121 }
10122 }
10123
10124 /* Implement targetm.vectorize.add_stmt_cost. */
10125 static unsigned
10126 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10127 struct _stmt_vec_info *stmt_info, int misalign,
10128 enum vect_cost_model_location where)
10129 {
10130 unsigned *cost = (unsigned *) data;
10131 unsigned retval = 0;
10132
10133 if (flag_vect_cost_model)
10134 {
10135 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10136 int stmt_cost =
10137 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10138
10139 /* Statements in an inner loop relative to the loop being
10140 vectorized are weighted more heavily. The value here is
10141 arbitrary and could potentially be improved with analysis. */
10142 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10143 count *= 50; /* FIXME */
10144
10145 retval = (unsigned) (count * stmt_cost);
10146 cost[where] += retval;
10147 }
10148
10149 return retval;
10150 }
10151
10152 static void initialize_aarch64_code_model (struct gcc_options *);
10153
10154 /* Parse the TO_PARSE string and put the architecture struct that it
10155 selects into RES and the architectural features into ISA_FLAGS.
10156 Return an aarch64_parse_opt_result describing the parse result.
10157 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10158
10159 static enum aarch64_parse_opt_result
10160 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10161 unsigned long *isa_flags)
10162 {
10163 char *ext;
10164 const struct processor *arch;
10165 char *str = (char *) alloca (strlen (to_parse) + 1);
10166 size_t len;
10167
10168 strcpy (str, to_parse);
10169
10170 ext = strchr (str, '+');
10171
10172 if (ext != NULL)
10173 len = ext - str;
10174 else
10175 len = strlen (str);
10176
10177 if (len == 0)
10178 return AARCH64_PARSE_MISSING_ARG;
10179
10180
10181 /* Loop through the list of supported ARCHes to find a match. */
10182 for (arch = all_architectures; arch->name != NULL; arch++)
10183 {
10184 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10185 {
10186 unsigned long isa_temp = arch->flags;
10187
10188 if (ext != NULL)
10189 {
10190 /* TO_PARSE string contains at least one extension. */
10191 enum aarch64_parse_opt_result ext_res
10192 = aarch64_parse_extension (ext, &isa_temp);
10193
10194 if (ext_res != AARCH64_PARSE_OK)
10195 return ext_res;
10196 }
10197 /* Extension parsing was successful. Confirm the result
10198 arch and ISA flags. */
10199 *res = arch;
10200 *isa_flags = isa_temp;
10201 return AARCH64_PARSE_OK;
10202 }
10203 }
10204
10205 /* ARCH name not found in list. */
10206 return AARCH64_PARSE_INVALID_ARG;
10207 }
10208
10209 /* Parse the TO_PARSE string and put the result tuning in RES and the
10210 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10211 describing the parse result. If there is an error parsing, RES and
10212 ISA_FLAGS are left unchanged. */
10213
10214 static enum aarch64_parse_opt_result
10215 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10216 unsigned long *isa_flags)
10217 {
10218 char *ext;
10219 const struct processor *cpu;
10220 char *str = (char *) alloca (strlen (to_parse) + 1);
10221 size_t len;
10222
10223 strcpy (str, to_parse);
10224
10225 ext = strchr (str, '+');
10226
10227 if (ext != NULL)
10228 len = ext - str;
10229 else
10230 len = strlen (str);
10231
10232 if (len == 0)
10233 return AARCH64_PARSE_MISSING_ARG;
10234
10235
10236 /* Loop through the list of supported CPUs to find a match. */
10237 for (cpu = all_cores; cpu->name != NULL; cpu++)
10238 {
10239 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10240 {
10241 unsigned long isa_temp = cpu->flags;
10242
10243
10244 if (ext != NULL)
10245 {
10246 /* TO_PARSE string contains at least one extension. */
10247 enum aarch64_parse_opt_result ext_res
10248 = aarch64_parse_extension (ext, &isa_temp);
10249
10250 if (ext_res != AARCH64_PARSE_OK)
10251 return ext_res;
10252 }
10253 /* Extension parsing was successfull. Confirm the result
10254 cpu and ISA flags. */
10255 *res = cpu;
10256 *isa_flags = isa_temp;
10257 return AARCH64_PARSE_OK;
10258 }
10259 }
10260
10261 /* CPU name not found in list. */
10262 return AARCH64_PARSE_INVALID_ARG;
10263 }
10264
10265 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10266 Return an aarch64_parse_opt_result describing the parse result.
10267 If the parsing fails the RES does not change. */
10268
10269 static enum aarch64_parse_opt_result
10270 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10271 {
10272 const struct processor *cpu;
10273 char *str = (char *) alloca (strlen (to_parse) + 1);
10274
10275 strcpy (str, to_parse);
10276
10277 /* Loop through the list of supported CPUs to find a match. */
10278 for (cpu = all_cores; cpu->name != NULL; cpu++)
10279 {
10280 if (strcmp (cpu->name, str) == 0)
10281 {
10282 *res = cpu;
10283 return AARCH64_PARSE_OK;
10284 }
10285 }
10286
10287 /* CPU name not found in list. */
10288 return AARCH64_PARSE_INVALID_ARG;
10289 }
10290
10291 /* Parse TOKEN, which has length LENGTH to see if it is an option
10292 described in FLAG. If it is, return the index bit for that fusion type.
10293 If not, error (printing OPTION_NAME) and return zero. */
10294
10295 static unsigned int
10296 aarch64_parse_one_option_token (const char *token,
10297 size_t length,
10298 const struct aarch64_flag_desc *flag,
10299 const char *option_name)
10300 {
10301 for (; flag->name != NULL; flag++)
10302 {
10303 if (length == strlen (flag->name)
10304 && !strncmp (flag->name, token, length))
10305 return flag->flag;
10306 }
10307
10308 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10309 return 0;
10310 }
10311
10312 /* Parse OPTION which is a comma-separated list of flags to enable.
10313 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10314 default state we inherit from the CPU tuning structures. OPTION_NAME
10315 gives the top-level option we are parsing in the -moverride string,
10316 for use in error messages. */
10317
10318 static unsigned int
10319 aarch64_parse_boolean_options (const char *option,
10320 const struct aarch64_flag_desc *flags,
10321 unsigned int initial_state,
10322 const char *option_name)
10323 {
10324 const char separator = '.';
10325 const char* specs = option;
10326 const char* ntoken = option;
10327 unsigned int found_flags = initial_state;
10328
10329 while ((ntoken = strchr (specs, separator)))
10330 {
10331 size_t token_length = ntoken - specs;
10332 unsigned token_ops = aarch64_parse_one_option_token (specs,
10333 token_length,
10334 flags,
10335 option_name);
10336 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10337 in the token stream, reset the supported operations. So:
10338
10339 adrp+add.cmp+branch.none.adrp+add
10340
10341 would have the result of turning on only adrp+add fusion. */
10342 if (!token_ops)
10343 found_flags = 0;
10344
10345 found_flags |= token_ops;
10346 specs = ++ntoken;
10347 }
10348
10349 /* We ended with a comma, print something. */
10350 if (!(*specs))
10351 {
10352 error ("%s string ill-formed\n", option_name);
10353 return 0;
10354 }
10355
10356 /* We still have one more token to parse. */
10357 size_t token_length = strlen (specs);
10358 unsigned token_ops = aarch64_parse_one_option_token (specs,
10359 token_length,
10360 flags,
10361 option_name);
10362 if (!token_ops)
10363 found_flags = 0;
10364
10365 found_flags |= token_ops;
10366 return found_flags;
10367 }
10368
10369 /* Support for overriding instruction fusion. */
10370
10371 static void
10372 aarch64_parse_fuse_string (const char *fuse_string,
10373 struct tune_params *tune)
10374 {
10375 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10376 aarch64_fusible_pairs,
10377 tune->fusible_ops,
10378 "fuse=");
10379 }
10380
10381 /* Support for overriding other tuning flags. */
10382
10383 static void
10384 aarch64_parse_tune_string (const char *tune_string,
10385 struct tune_params *tune)
10386 {
10387 tune->extra_tuning_flags
10388 = aarch64_parse_boolean_options (tune_string,
10389 aarch64_tuning_flags,
10390 tune->extra_tuning_flags,
10391 "tune=");
10392 }
10393
10394 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10395 we understand. If it is, extract the option string and handoff to
10396 the appropriate function. */
10397
10398 void
10399 aarch64_parse_one_override_token (const char* token,
10400 size_t length,
10401 struct tune_params *tune)
10402 {
10403 const struct aarch64_tuning_override_function *fn
10404 = aarch64_tuning_override_functions;
10405
10406 const char *option_part = strchr (token, '=');
10407 if (!option_part)
10408 {
10409 error ("tuning string missing in option (%s)", token);
10410 return;
10411 }
10412
10413 /* Get the length of the option name. */
10414 length = option_part - token;
10415 /* Skip the '=' to get to the option string. */
10416 option_part++;
10417
10418 for (; fn->name != NULL; fn++)
10419 {
10420 if (!strncmp (fn->name, token, length))
10421 {
10422 fn->parse_override (option_part, tune);
10423 return;
10424 }
10425 }
10426
10427 error ("unknown tuning option (%s)",token);
10428 return;
10429 }
10430
10431 /* A checking mechanism for the implementation of the tls size. */
10432
10433 static void
10434 initialize_aarch64_tls_size (struct gcc_options *opts)
10435 {
10436 if (aarch64_tls_size == 0)
10437 aarch64_tls_size = 24;
10438
10439 switch (opts->x_aarch64_cmodel_var)
10440 {
10441 case AARCH64_CMODEL_TINY:
10442 /* Both the default and maximum TLS size allowed under tiny is 1M which
10443 needs two instructions to address, so we clamp the size to 24. */
10444 if (aarch64_tls_size > 24)
10445 aarch64_tls_size = 24;
10446 break;
10447 case AARCH64_CMODEL_SMALL:
10448 /* The maximum TLS size allowed under small is 4G. */
10449 if (aarch64_tls_size > 32)
10450 aarch64_tls_size = 32;
10451 break;
10452 case AARCH64_CMODEL_LARGE:
10453 /* The maximum TLS size allowed under large is 16E.
10454 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10455 if (aarch64_tls_size > 48)
10456 aarch64_tls_size = 48;
10457 break;
10458 default:
10459 gcc_unreachable ();
10460 }
10461
10462 return;
10463 }
10464
10465 /* Parse STRING looking for options in the format:
10466 string :: option:string
10467 option :: name=substring
10468 name :: {a-z}
10469 substring :: defined by option. */
10470
10471 static void
10472 aarch64_parse_override_string (const char* input_string,
10473 struct tune_params* tune)
10474 {
10475 const char separator = ':';
10476 size_t string_length = strlen (input_string) + 1;
10477 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10478 char *string = string_root;
10479 strncpy (string, input_string, string_length);
10480 string[string_length - 1] = '\0';
10481
10482 char* ntoken = string;
10483
10484 while ((ntoken = strchr (string, separator)))
10485 {
10486 size_t token_length = ntoken - string;
10487 /* Make this substring look like a string. */
10488 *ntoken = '\0';
10489 aarch64_parse_one_override_token (string, token_length, tune);
10490 string = ++ntoken;
10491 }
10492
10493 /* One last option to parse. */
10494 aarch64_parse_one_override_token (string, strlen (string), tune);
10495 free (string_root);
10496 }
10497
10498
10499 static void
10500 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10501 {
10502 /* PR 70044: We have to be careful about being called multiple times for the
10503 same function. This means all changes should be repeatable. */
10504
10505 /* If the frame pointer is enabled, set it to a special value that behaves
10506 similar to frame pointer omission. If we don't do this all leaf functions
10507 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10508 If flag_omit_frame_pointer has this special value, we must force the
10509 frame pointer if not in a leaf function. We also need to force it in a
10510 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10511 if (opts->x_flag_omit_frame_pointer == 0)
10512 opts->x_flag_omit_frame_pointer = 2;
10513
10514 /* If not optimizing for size, set the default
10515 alignment to what the target wants. */
10516 if (!opts->x_optimize_size)
10517 {
10518 if (opts->x_align_loops <= 0)
10519 opts->x_align_loops = aarch64_tune_params.loop_align;
10520 if (opts->x_align_jumps <= 0)
10521 opts->x_align_jumps = aarch64_tune_params.jump_align;
10522 if (opts->x_align_functions <= 0)
10523 opts->x_align_functions = aarch64_tune_params.function_align;
10524 }
10525
10526 /* We default to no pc-relative literal loads. */
10527
10528 aarch64_pcrelative_literal_loads = false;
10529
10530 /* If -mpc-relative-literal-loads is set on the command line, this
10531 implies that the user asked for PC relative literal loads. */
10532 if (opts->x_pcrelative_literal_loads == 1)
10533 aarch64_pcrelative_literal_loads = true;
10534
10535 /* In the tiny memory model it makes no sense to disallow PC relative
10536 literal pool loads. */
10537 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10538 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10539 aarch64_pcrelative_literal_loads = true;
10540
10541 /* When enabling the lower precision Newton series for the square root, also
10542 enable it for the reciprocal square root, since the latter is an
10543 intermediary step for the former. */
10544 if (flag_mlow_precision_sqrt)
10545 flag_mrecip_low_precision_sqrt = true;
10546 }
10547
10548 /* 'Unpack' up the internal tuning structs and update the options
10549 in OPTS. The caller must have set up selected_tune and selected_arch
10550 as all the other target-specific codegen decisions are
10551 derived from them. */
10552
10553 void
10554 aarch64_override_options_internal (struct gcc_options *opts)
10555 {
10556 aarch64_tune_flags = selected_tune->flags;
10557 aarch64_tune = selected_tune->sched_core;
10558 /* Make a copy of the tuning parameters attached to the core, which
10559 we may later overwrite. */
10560 aarch64_tune_params = *(selected_tune->tune);
10561 aarch64_architecture_version = selected_arch->architecture_version;
10562
10563 if (opts->x_aarch64_override_tune_string)
10564 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10565 &aarch64_tune_params);
10566
10567 /* This target defaults to strict volatile bitfields. */
10568 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10569 opts->x_flag_strict_volatile_bitfields = 1;
10570
10571 initialize_aarch64_code_model (opts);
10572 initialize_aarch64_tls_size (opts);
10573
10574 int queue_depth = 0;
10575 switch (aarch64_tune_params.autoprefetcher_model)
10576 {
10577 case tune_params::AUTOPREFETCHER_OFF:
10578 queue_depth = -1;
10579 break;
10580 case tune_params::AUTOPREFETCHER_WEAK:
10581 queue_depth = 0;
10582 break;
10583 case tune_params::AUTOPREFETCHER_STRONG:
10584 queue_depth = max_insn_queue_index + 1;
10585 break;
10586 default:
10587 gcc_unreachable ();
10588 }
10589
10590 /* We don't mind passing in global_options_set here as we don't use
10591 the *options_set structs anyway. */
10592 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10593 queue_depth,
10594 opts->x_param_values,
10595 global_options_set.x_param_values);
10596
10597 /* Set up parameters to be used in prefetching algorithm. Do not
10598 override the defaults unless we are tuning for a core we have
10599 researched values for. */
10600 if (aarch64_tune_params.prefetch->num_slots > 0)
10601 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10602 aarch64_tune_params.prefetch->num_slots,
10603 opts->x_param_values,
10604 global_options_set.x_param_values);
10605 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10606 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10607 aarch64_tune_params.prefetch->l1_cache_size,
10608 opts->x_param_values,
10609 global_options_set.x_param_values);
10610 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10611 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10612 aarch64_tune_params.prefetch->l1_cache_line_size,
10613 opts->x_param_values,
10614 global_options_set.x_param_values);
10615 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10616 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10617 aarch64_tune_params.prefetch->l2_cache_size,
10618 opts->x_param_values,
10619 global_options_set.x_param_values);
10620
10621 /* Use the alternative scheduling-pressure algorithm by default. */
10622 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10623 opts->x_param_values,
10624 global_options_set.x_param_values);
10625
10626 /* Enable sw prefetching at specified optimization level for
10627 CPUS that have prefetch. Lower optimization level threshold by 1
10628 when profiling is enabled. */
10629 if (opts->x_flag_prefetch_loop_arrays < 0
10630 && !opts->x_optimize_size
10631 && aarch64_tune_params.prefetch->default_opt_level >= 0
10632 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10633 opts->x_flag_prefetch_loop_arrays = 1;
10634
10635 aarch64_override_options_after_change_1 (opts);
10636 }
10637
10638 /* Print a hint with a suggestion for a core or architecture name that
10639 most closely resembles what the user passed in STR. ARCH is true if
10640 the user is asking for an architecture name. ARCH is false if the user
10641 is asking for a core name. */
10642
10643 static void
10644 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10645 {
10646 auto_vec<const char *> candidates;
10647 const struct processor *entry = arch ? all_architectures : all_cores;
10648 for (; entry->name != NULL; entry++)
10649 candidates.safe_push (entry->name);
10650
10651 #ifdef HAVE_LOCAL_CPU_DETECT
10652 /* Add also "native" as possible value. */
10653 if (arch)
10654 candidates.safe_push ("native");
10655 #endif
10656
10657 char *s;
10658 const char *hint = candidates_list_and_hint (str, s, candidates);
10659 if (hint)
10660 inform (input_location, "valid arguments are: %s;"
10661 " did you mean %qs?", s, hint);
10662 else
10663 inform (input_location, "valid arguments are: %s", s);
10664
10665 XDELETEVEC (s);
10666 }
10667
10668 /* Print a hint with a suggestion for a core name that most closely resembles
10669 what the user passed in STR. */
10670
10671 inline static void
10672 aarch64_print_hint_for_core (const char *str)
10673 {
10674 aarch64_print_hint_for_core_or_arch (str, false);
10675 }
10676
10677 /* Print a hint with a suggestion for an architecture name that most closely
10678 resembles what the user passed in STR. */
10679
10680 inline static void
10681 aarch64_print_hint_for_arch (const char *str)
10682 {
10683 aarch64_print_hint_for_core_or_arch (str, true);
10684 }
10685
10686 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10687 specified in STR and throw errors if appropriate. Put the results if
10688 they are valid in RES and ISA_FLAGS. Return whether the option is
10689 valid. */
10690
10691 static bool
10692 aarch64_validate_mcpu (const char *str, const struct processor **res,
10693 unsigned long *isa_flags)
10694 {
10695 enum aarch64_parse_opt_result parse_res
10696 = aarch64_parse_cpu (str, res, isa_flags);
10697
10698 if (parse_res == AARCH64_PARSE_OK)
10699 return true;
10700
10701 switch (parse_res)
10702 {
10703 case AARCH64_PARSE_MISSING_ARG:
10704 error ("missing cpu name in %<-mcpu=%s%>", str);
10705 break;
10706 case AARCH64_PARSE_INVALID_ARG:
10707 error ("unknown value %qs for -mcpu", str);
10708 aarch64_print_hint_for_core (str);
10709 break;
10710 case AARCH64_PARSE_INVALID_FEATURE:
10711 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10712 break;
10713 default:
10714 gcc_unreachable ();
10715 }
10716
10717 return false;
10718 }
10719
10720 /* Validate a command-line -march option. Parse the arch and extensions
10721 (if any) specified in STR and throw errors if appropriate. Put the
10722 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10723 option is valid. */
10724
10725 static bool
10726 aarch64_validate_march (const char *str, const struct processor **res,
10727 unsigned long *isa_flags)
10728 {
10729 enum aarch64_parse_opt_result parse_res
10730 = aarch64_parse_arch (str, res, isa_flags);
10731
10732 if (parse_res == AARCH64_PARSE_OK)
10733 return true;
10734
10735 switch (parse_res)
10736 {
10737 case AARCH64_PARSE_MISSING_ARG:
10738 error ("missing arch name in %<-march=%s%>", str);
10739 break;
10740 case AARCH64_PARSE_INVALID_ARG:
10741 error ("unknown value %qs for -march", str);
10742 aarch64_print_hint_for_arch (str);
10743 break;
10744 case AARCH64_PARSE_INVALID_FEATURE:
10745 error ("invalid feature modifier in %<-march=%s%>", str);
10746 break;
10747 default:
10748 gcc_unreachable ();
10749 }
10750
10751 return false;
10752 }
10753
10754 /* Validate a command-line -mtune option. Parse the cpu
10755 specified in STR and throw errors if appropriate. Put the
10756 result, if it is valid, in RES. Return whether the option is
10757 valid. */
10758
10759 static bool
10760 aarch64_validate_mtune (const char *str, const struct processor **res)
10761 {
10762 enum aarch64_parse_opt_result parse_res
10763 = aarch64_parse_tune (str, res);
10764
10765 if (parse_res == AARCH64_PARSE_OK)
10766 return true;
10767
10768 switch (parse_res)
10769 {
10770 case AARCH64_PARSE_MISSING_ARG:
10771 error ("missing cpu name in %<-mtune=%s%>", str);
10772 break;
10773 case AARCH64_PARSE_INVALID_ARG:
10774 error ("unknown value %qs for -mtune", str);
10775 aarch64_print_hint_for_core (str);
10776 break;
10777 default:
10778 gcc_unreachable ();
10779 }
10780 return false;
10781 }
10782
10783 /* Return the CPU corresponding to the enum CPU.
10784 If it doesn't specify a cpu, return the default. */
10785
10786 static const struct processor *
10787 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10788 {
10789 if (cpu != aarch64_none)
10790 return &all_cores[cpu];
10791
10792 /* The & 0x3f is to extract the bottom 6 bits that encode the
10793 default cpu as selected by the --with-cpu GCC configure option
10794 in config.gcc.
10795 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10796 flags mechanism should be reworked to make it more sane. */
10797 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10798 }
10799
10800 /* Return the architecture corresponding to the enum ARCH.
10801 If it doesn't specify a valid architecture, return the default. */
10802
10803 static const struct processor *
10804 aarch64_get_arch (enum aarch64_arch arch)
10805 {
10806 if (arch != aarch64_no_arch)
10807 return &all_architectures[arch];
10808
10809 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10810
10811 return &all_architectures[cpu->arch];
10812 }
10813
10814 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10815
10816 static poly_uint16
10817 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10818 {
10819 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10820 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10821 deciding which .md file patterns to use and when deciding whether
10822 something is a legitimate address or constant. */
10823 if (value == SVE_SCALABLE || value == SVE_128)
10824 return poly_uint16 (2, 2);
10825 else
10826 return (int) value / 64;
10827 }
10828
10829 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10830 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10831 tuning structs. In particular it must set selected_tune and
10832 aarch64_isa_flags that define the available ISA features and tuning
10833 decisions. It must also set selected_arch as this will be used to
10834 output the .arch asm tags for each function. */
10835
10836 static void
10837 aarch64_override_options (void)
10838 {
10839 unsigned long cpu_isa = 0;
10840 unsigned long arch_isa = 0;
10841 aarch64_isa_flags = 0;
10842
10843 bool valid_cpu = true;
10844 bool valid_tune = true;
10845 bool valid_arch = true;
10846
10847 selected_cpu = NULL;
10848 selected_arch = NULL;
10849 selected_tune = NULL;
10850
10851 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10852 If either of -march or -mtune is given, they override their
10853 respective component of -mcpu. */
10854 if (aarch64_cpu_string)
10855 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10856 &cpu_isa);
10857
10858 if (aarch64_arch_string)
10859 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10860 &arch_isa);
10861
10862 if (aarch64_tune_string)
10863 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10864
10865 /* If the user did not specify a processor, choose the default
10866 one for them. This will be the CPU set during configuration using
10867 --with-cpu, otherwise it is "generic". */
10868 if (!selected_cpu)
10869 {
10870 if (selected_arch)
10871 {
10872 selected_cpu = &all_cores[selected_arch->ident];
10873 aarch64_isa_flags = arch_isa;
10874 explicit_arch = selected_arch->arch;
10875 }
10876 else
10877 {
10878 /* Get default configure-time CPU. */
10879 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10880 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10881 }
10882
10883 if (selected_tune)
10884 explicit_tune_core = selected_tune->ident;
10885 }
10886 /* If both -mcpu and -march are specified check that they are architecturally
10887 compatible, warn if they're not and prefer the -march ISA flags. */
10888 else if (selected_arch)
10889 {
10890 if (selected_arch->arch != selected_cpu->arch)
10891 {
10892 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10893 all_architectures[selected_cpu->arch].name,
10894 selected_arch->name);
10895 }
10896 aarch64_isa_flags = arch_isa;
10897 explicit_arch = selected_arch->arch;
10898 explicit_tune_core = selected_tune ? selected_tune->ident
10899 : selected_cpu->ident;
10900 }
10901 else
10902 {
10903 /* -mcpu but no -march. */
10904 aarch64_isa_flags = cpu_isa;
10905 explicit_tune_core = selected_tune ? selected_tune->ident
10906 : selected_cpu->ident;
10907 gcc_assert (selected_cpu);
10908 selected_arch = &all_architectures[selected_cpu->arch];
10909 explicit_arch = selected_arch->arch;
10910 }
10911
10912 /* Set the arch as well as we will need it when outputing
10913 the .arch directive in assembly. */
10914 if (!selected_arch)
10915 {
10916 gcc_assert (selected_cpu);
10917 selected_arch = &all_architectures[selected_cpu->arch];
10918 }
10919
10920 if (!selected_tune)
10921 selected_tune = selected_cpu;
10922
10923 #ifndef HAVE_AS_MABI_OPTION
10924 /* The compiler may have been configured with 2.23.* binutils, which does
10925 not have support for ILP32. */
10926 if (TARGET_ILP32)
10927 error ("assembler does not support -mabi=ilp32");
10928 #endif
10929
10930 /* Convert -msve-vector-bits to a VG count. */
10931 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10932
10933 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10934 sorry ("return address signing is only supported for -mabi=lp64");
10935
10936 /* Make sure we properly set up the explicit options. */
10937 if ((aarch64_cpu_string && valid_cpu)
10938 || (aarch64_tune_string && valid_tune))
10939 gcc_assert (explicit_tune_core != aarch64_none);
10940
10941 if ((aarch64_cpu_string && valid_cpu)
10942 || (aarch64_arch_string && valid_arch))
10943 gcc_assert (explicit_arch != aarch64_no_arch);
10944
10945 aarch64_override_options_internal (&global_options);
10946
10947 /* Save these options as the default ones in case we push and pop them later
10948 while processing functions with potential target attributes. */
10949 target_option_default_node = target_option_current_node
10950 = build_target_option_node (&global_options);
10951 }
10952
10953 /* Implement targetm.override_options_after_change. */
10954
10955 static void
10956 aarch64_override_options_after_change (void)
10957 {
10958 aarch64_override_options_after_change_1 (&global_options);
10959 }
10960
10961 static struct machine_function *
10962 aarch64_init_machine_status (void)
10963 {
10964 struct machine_function *machine;
10965 machine = ggc_cleared_alloc<machine_function> ();
10966 return machine;
10967 }
10968
10969 void
10970 aarch64_init_expanders (void)
10971 {
10972 init_machine_status = aarch64_init_machine_status;
10973 }
10974
10975 /* A checking mechanism for the implementation of the various code models. */
10976 static void
10977 initialize_aarch64_code_model (struct gcc_options *opts)
10978 {
10979 if (opts->x_flag_pic)
10980 {
10981 switch (opts->x_aarch64_cmodel_var)
10982 {
10983 case AARCH64_CMODEL_TINY:
10984 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10985 break;
10986 case AARCH64_CMODEL_SMALL:
10987 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10988 aarch64_cmodel = (flag_pic == 2
10989 ? AARCH64_CMODEL_SMALL_PIC
10990 : AARCH64_CMODEL_SMALL_SPIC);
10991 #else
10992 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10993 #endif
10994 break;
10995 case AARCH64_CMODEL_LARGE:
10996 sorry ("code model %qs with -f%s", "large",
10997 opts->x_flag_pic > 1 ? "PIC" : "pic");
10998 break;
10999 default:
11000 gcc_unreachable ();
11001 }
11002 }
11003 else
11004 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11005 }
11006
11007 /* Implement TARGET_OPTION_SAVE. */
11008
11009 static void
11010 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11011 {
11012 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11013 }
11014
11015 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11016 using the information saved in PTR. */
11017
11018 static void
11019 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11020 {
11021 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11022 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11023 opts->x_explicit_arch = ptr->x_explicit_arch;
11024 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11025 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11026
11027 aarch64_override_options_internal (opts);
11028 }
11029
11030 /* Implement TARGET_OPTION_PRINT. */
11031
11032 static void
11033 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11034 {
11035 const struct processor *cpu
11036 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11037 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11038 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11039 std::string extension
11040 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11041
11042 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11043 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11044 arch->name, extension.c_str ());
11045 }
11046
11047 static GTY(()) tree aarch64_previous_fndecl;
11048
11049 void
11050 aarch64_reset_previous_fndecl (void)
11051 {
11052 aarch64_previous_fndecl = NULL;
11053 }
11054
11055 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11056 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11057 make sure optab availability predicates are recomputed when necessary. */
11058
11059 void
11060 aarch64_save_restore_target_globals (tree new_tree)
11061 {
11062 if (TREE_TARGET_GLOBALS (new_tree))
11063 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11064 else if (new_tree == target_option_default_node)
11065 restore_target_globals (&default_target_globals);
11066 else
11067 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11068 }
11069
11070 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11071 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11072 of the function, if such exists. This function may be called multiple
11073 times on a single function so use aarch64_previous_fndecl to avoid
11074 setting up identical state. */
11075
11076 static void
11077 aarch64_set_current_function (tree fndecl)
11078 {
11079 if (!fndecl || fndecl == aarch64_previous_fndecl)
11080 return;
11081
11082 tree old_tree = (aarch64_previous_fndecl
11083 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11084 : NULL_TREE);
11085
11086 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11087
11088 /* If current function has no attributes but the previous one did,
11089 use the default node. */
11090 if (!new_tree && old_tree)
11091 new_tree = target_option_default_node;
11092
11093 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11094 the default have been handled by aarch64_save_restore_target_globals from
11095 aarch64_pragma_target_parse. */
11096 if (old_tree == new_tree)
11097 return;
11098
11099 aarch64_previous_fndecl = fndecl;
11100
11101 /* First set the target options. */
11102 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11103
11104 aarch64_save_restore_target_globals (new_tree);
11105 }
11106
11107 /* Enum describing the various ways we can handle attributes.
11108 In many cases we can reuse the generic option handling machinery. */
11109
11110 enum aarch64_attr_opt_type
11111 {
11112 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11113 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11114 aarch64_attr_enum, /* Attribute sets an enum variable. */
11115 aarch64_attr_custom /* Attribute requires a custom handling function. */
11116 };
11117
11118 /* All the information needed to handle a target attribute.
11119 NAME is the name of the attribute.
11120 ATTR_TYPE specifies the type of behavior of the attribute as described
11121 in the definition of enum aarch64_attr_opt_type.
11122 ALLOW_NEG is true if the attribute supports a "no-" form.
11123 HANDLER is the function that takes the attribute string as an argument
11124 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11125 OPT_NUM is the enum specifying the option that the attribute modifies.
11126 This is needed for attributes that mirror the behavior of a command-line
11127 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11128 aarch64_attr_enum. */
11129
11130 struct aarch64_attribute_info
11131 {
11132 const char *name;
11133 enum aarch64_attr_opt_type attr_type;
11134 bool allow_neg;
11135 bool (*handler) (const char *);
11136 enum opt_code opt_num;
11137 };
11138
11139 /* Handle the ARCH_STR argument to the arch= target attribute. */
11140
11141 static bool
11142 aarch64_handle_attr_arch (const char *str)
11143 {
11144 const struct processor *tmp_arch = NULL;
11145 enum aarch64_parse_opt_result parse_res
11146 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11147
11148 if (parse_res == AARCH64_PARSE_OK)
11149 {
11150 gcc_assert (tmp_arch);
11151 selected_arch = tmp_arch;
11152 explicit_arch = selected_arch->arch;
11153 return true;
11154 }
11155
11156 switch (parse_res)
11157 {
11158 case AARCH64_PARSE_MISSING_ARG:
11159 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11160 break;
11161 case AARCH64_PARSE_INVALID_ARG:
11162 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11163 aarch64_print_hint_for_arch (str);
11164 break;
11165 case AARCH64_PARSE_INVALID_FEATURE:
11166 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11167 break;
11168 default:
11169 gcc_unreachable ();
11170 }
11171
11172 return false;
11173 }
11174
11175 /* Handle the argument CPU_STR to the cpu= target attribute. */
11176
11177 static bool
11178 aarch64_handle_attr_cpu (const char *str)
11179 {
11180 const struct processor *tmp_cpu = NULL;
11181 enum aarch64_parse_opt_result parse_res
11182 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11183
11184 if (parse_res == AARCH64_PARSE_OK)
11185 {
11186 gcc_assert (tmp_cpu);
11187 selected_tune = tmp_cpu;
11188 explicit_tune_core = selected_tune->ident;
11189
11190 selected_arch = &all_architectures[tmp_cpu->arch];
11191 explicit_arch = selected_arch->arch;
11192 return true;
11193 }
11194
11195 switch (parse_res)
11196 {
11197 case AARCH64_PARSE_MISSING_ARG:
11198 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11199 break;
11200 case AARCH64_PARSE_INVALID_ARG:
11201 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11202 aarch64_print_hint_for_core (str);
11203 break;
11204 case AARCH64_PARSE_INVALID_FEATURE:
11205 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11206 break;
11207 default:
11208 gcc_unreachable ();
11209 }
11210
11211 return false;
11212 }
11213
11214 /* Handle the argument STR to the tune= target attribute. */
11215
11216 static bool
11217 aarch64_handle_attr_tune (const char *str)
11218 {
11219 const struct processor *tmp_tune = NULL;
11220 enum aarch64_parse_opt_result parse_res
11221 = aarch64_parse_tune (str, &tmp_tune);
11222
11223 if (parse_res == AARCH64_PARSE_OK)
11224 {
11225 gcc_assert (tmp_tune);
11226 selected_tune = tmp_tune;
11227 explicit_tune_core = selected_tune->ident;
11228 return true;
11229 }
11230
11231 switch (parse_res)
11232 {
11233 case AARCH64_PARSE_INVALID_ARG:
11234 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11235 aarch64_print_hint_for_core (str);
11236 break;
11237 default:
11238 gcc_unreachable ();
11239 }
11240
11241 return false;
11242 }
11243
11244 /* Parse an architecture extensions target attribute string specified in STR.
11245 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11246 if successful. Update aarch64_isa_flags to reflect the ISA features
11247 modified. */
11248
11249 static bool
11250 aarch64_handle_attr_isa_flags (char *str)
11251 {
11252 enum aarch64_parse_opt_result parse_res;
11253 unsigned long isa_flags = aarch64_isa_flags;
11254
11255 /* We allow "+nothing" in the beginning to clear out all architectural
11256 features if the user wants to handpick specific features. */
11257 if (strncmp ("+nothing", str, 8) == 0)
11258 {
11259 isa_flags = 0;
11260 str += 8;
11261 }
11262
11263 parse_res = aarch64_parse_extension (str, &isa_flags);
11264
11265 if (parse_res == AARCH64_PARSE_OK)
11266 {
11267 aarch64_isa_flags = isa_flags;
11268 return true;
11269 }
11270
11271 switch (parse_res)
11272 {
11273 case AARCH64_PARSE_MISSING_ARG:
11274 error ("missing value in %<target()%> pragma or attribute");
11275 break;
11276
11277 case AARCH64_PARSE_INVALID_FEATURE:
11278 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11279 break;
11280
11281 default:
11282 gcc_unreachable ();
11283 }
11284
11285 return false;
11286 }
11287
11288 /* The target attributes that we support. On top of these we also support just
11289 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11290 handled explicitly in aarch64_process_one_target_attr. */
11291
11292 static const struct aarch64_attribute_info aarch64_attributes[] =
11293 {
11294 { "general-regs-only", aarch64_attr_mask, false, NULL,
11295 OPT_mgeneral_regs_only },
11296 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11297 OPT_mfix_cortex_a53_835769 },
11298 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11299 OPT_mfix_cortex_a53_843419 },
11300 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11301 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11302 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11303 OPT_momit_leaf_frame_pointer },
11304 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11305 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11306 OPT_march_ },
11307 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11308 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11309 OPT_mtune_ },
11310 { "sign-return-address", aarch64_attr_enum, false, NULL,
11311 OPT_msign_return_address_ },
11312 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11313 };
11314
11315 /* Parse ARG_STR which contains the definition of one target attribute.
11316 Show appropriate errors if any or return true if the attribute is valid. */
11317
11318 static bool
11319 aarch64_process_one_target_attr (char *arg_str)
11320 {
11321 bool invert = false;
11322
11323 size_t len = strlen (arg_str);
11324
11325 if (len == 0)
11326 {
11327 error ("malformed %<target()%> pragma or attribute");
11328 return false;
11329 }
11330
11331 char *str_to_check = (char *) alloca (len + 1);
11332 strcpy (str_to_check, arg_str);
11333
11334 /* Skip leading whitespace. */
11335 while (*str_to_check == ' ' || *str_to_check == '\t')
11336 str_to_check++;
11337
11338 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11339 It is easier to detect and handle it explicitly here rather than going
11340 through the machinery for the rest of the target attributes in this
11341 function. */
11342 if (*str_to_check == '+')
11343 return aarch64_handle_attr_isa_flags (str_to_check);
11344
11345 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11346 {
11347 invert = true;
11348 str_to_check += 3;
11349 }
11350 char *arg = strchr (str_to_check, '=');
11351
11352 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11353 and point ARG to "foo". */
11354 if (arg)
11355 {
11356 *arg = '\0';
11357 arg++;
11358 }
11359 const struct aarch64_attribute_info *p_attr;
11360 bool found = false;
11361 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11362 {
11363 /* If the names don't match up, or the user has given an argument
11364 to an attribute that doesn't accept one, or didn't give an argument
11365 to an attribute that expects one, fail to match. */
11366 if (strcmp (str_to_check, p_attr->name) != 0)
11367 continue;
11368
11369 found = true;
11370 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11371 || p_attr->attr_type == aarch64_attr_enum;
11372
11373 if (attr_need_arg_p ^ (arg != NULL))
11374 {
11375 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11376 return false;
11377 }
11378
11379 /* If the name matches but the attribute does not allow "no-" versions
11380 then we can't match. */
11381 if (invert && !p_attr->allow_neg)
11382 {
11383 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11384 return false;
11385 }
11386
11387 switch (p_attr->attr_type)
11388 {
11389 /* Has a custom handler registered.
11390 For example, cpu=, arch=, tune=. */
11391 case aarch64_attr_custom:
11392 gcc_assert (p_attr->handler);
11393 if (!p_attr->handler (arg))
11394 return false;
11395 break;
11396
11397 /* Either set or unset a boolean option. */
11398 case aarch64_attr_bool:
11399 {
11400 struct cl_decoded_option decoded;
11401
11402 generate_option (p_attr->opt_num, NULL, !invert,
11403 CL_TARGET, &decoded);
11404 aarch64_handle_option (&global_options, &global_options_set,
11405 &decoded, input_location);
11406 break;
11407 }
11408 /* Set or unset a bit in the target_flags. aarch64_handle_option
11409 should know what mask to apply given the option number. */
11410 case aarch64_attr_mask:
11411 {
11412 struct cl_decoded_option decoded;
11413 /* We only need to specify the option number.
11414 aarch64_handle_option will know which mask to apply. */
11415 decoded.opt_index = p_attr->opt_num;
11416 decoded.value = !invert;
11417 aarch64_handle_option (&global_options, &global_options_set,
11418 &decoded, input_location);
11419 break;
11420 }
11421 /* Use the option setting machinery to set an option to an enum. */
11422 case aarch64_attr_enum:
11423 {
11424 gcc_assert (arg);
11425 bool valid;
11426 int value;
11427 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11428 &value, CL_TARGET);
11429 if (valid)
11430 {
11431 set_option (&global_options, NULL, p_attr->opt_num, value,
11432 NULL, DK_UNSPECIFIED, input_location,
11433 global_dc);
11434 }
11435 else
11436 {
11437 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11438 }
11439 break;
11440 }
11441 default:
11442 gcc_unreachable ();
11443 }
11444 }
11445
11446 /* If we reached here we either have found an attribute and validated
11447 it or didn't match any. If we matched an attribute but its arguments
11448 were malformed we will have returned false already. */
11449 return found;
11450 }
11451
11452 /* Count how many times the character C appears in
11453 NULL-terminated string STR. */
11454
11455 static unsigned int
11456 num_occurences_in_str (char c, char *str)
11457 {
11458 unsigned int res = 0;
11459 while (*str != '\0')
11460 {
11461 if (*str == c)
11462 res++;
11463
11464 str++;
11465 }
11466
11467 return res;
11468 }
11469
11470 /* Parse the tree in ARGS that contains the target attribute information
11471 and update the global target options space. */
11472
11473 bool
11474 aarch64_process_target_attr (tree args)
11475 {
11476 if (TREE_CODE (args) == TREE_LIST)
11477 {
11478 do
11479 {
11480 tree head = TREE_VALUE (args);
11481 if (head)
11482 {
11483 if (!aarch64_process_target_attr (head))
11484 return false;
11485 }
11486 args = TREE_CHAIN (args);
11487 } while (args);
11488
11489 return true;
11490 }
11491
11492 if (TREE_CODE (args) != STRING_CST)
11493 {
11494 error ("attribute %<target%> argument not a string");
11495 return false;
11496 }
11497
11498 size_t len = strlen (TREE_STRING_POINTER (args));
11499 char *str_to_check = (char *) alloca (len + 1);
11500 strcpy (str_to_check, TREE_STRING_POINTER (args));
11501
11502 if (len == 0)
11503 {
11504 error ("malformed %<target()%> pragma or attribute");
11505 return false;
11506 }
11507
11508 /* Used to catch empty spaces between commas i.e.
11509 attribute ((target ("attr1,,attr2"))). */
11510 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11511
11512 /* Handle multiple target attributes separated by ','. */
11513 char *token = strtok (str_to_check, ",");
11514
11515 unsigned int num_attrs = 0;
11516 while (token)
11517 {
11518 num_attrs++;
11519 if (!aarch64_process_one_target_attr (token))
11520 {
11521 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11522 return false;
11523 }
11524
11525 token = strtok (NULL, ",");
11526 }
11527
11528 if (num_attrs != num_commas + 1)
11529 {
11530 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11531 return false;
11532 }
11533
11534 return true;
11535 }
11536
11537 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11538 process attribute ((target ("..."))). */
11539
11540 static bool
11541 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11542 {
11543 struct cl_target_option cur_target;
11544 bool ret;
11545 tree old_optimize;
11546 tree new_target, new_optimize;
11547 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11548
11549 /* If what we're processing is the current pragma string then the
11550 target option node is already stored in target_option_current_node
11551 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11552 having to re-parse the string. This is especially useful to keep
11553 arm_neon.h compile times down since that header contains a lot
11554 of intrinsics enclosed in pragmas. */
11555 if (!existing_target && args == current_target_pragma)
11556 {
11557 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11558 return true;
11559 }
11560 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11561
11562 old_optimize = build_optimization_node (&global_options);
11563 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11564
11565 /* If the function changed the optimization levels as well as setting
11566 target options, start with the optimizations specified. */
11567 if (func_optimize && func_optimize != old_optimize)
11568 cl_optimization_restore (&global_options,
11569 TREE_OPTIMIZATION (func_optimize));
11570
11571 /* Save the current target options to restore at the end. */
11572 cl_target_option_save (&cur_target, &global_options);
11573
11574 /* If fndecl already has some target attributes applied to it, unpack
11575 them so that we add this attribute on top of them, rather than
11576 overwriting them. */
11577 if (existing_target)
11578 {
11579 struct cl_target_option *existing_options
11580 = TREE_TARGET_OPTION (existing_target);
11581
11582 if (existing_options)
11583 cl_target_option_restore (&global_options, existing_options);
11584 }
11585 else
11586 cl_target_option_restore (&global_options,
11587 TREE_TARGET_OPTION (target_option_current_node));
11588
11589 ret = aarch64_process_target_attr (args);
11590
11591 /* Set up any additional state. */
11592 if (ret)
11593 {
11594 aarch64_override_options_internal (&global_options);
11595 /* Initialize SIMD builtins if we haven't already.
11596 Set current_target_pragma to NULL for the duration so that
11597 the builtin initialization code doesn't try to tag the functions
11598 being built with the attributes specified by any current pragma, thus
11599 going into an infinite recursion. */
11600 if (TARGET_SIMD)
11601 {
11602 tree saved_current_target_pragma = current_target_pragma;
11603 current_target_pragma = NULL;
11604 aarch64_init_simd_builtins ();
11605 current_target_pragma = saved_current_target_pragma;
11606 }
11607 new_target = build_target_option_node (&global_options);
11608 }
11609 else
11610 new_target = NULL;
11611
11612 new_optimize = build_optimization_node (&global_options);
11613
11614 if (fndecl && ret)
11615 {
11616 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11617
11618 if (old_optimize != new_optimize)
11619 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11620 }
11621
11622 cl_target_option_restore (&global_options, &cur_target);
11623
11624 if (old_optimize != new_optimize)
11625 cl_optimization_restore (&global_options,
11626 TREE_OPTIMIZATION (old_optimize));
11627 return ret;
11628 }
11629
11630 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11631 tri-bool options (yes, no, don't care) and the default value is
11632 DEF, determine whether to reject inlining. */
11633
11634 static bool
11635 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11636 int dont_care, int def)
11637 {
11638 /* If the callee doesn't care, always allow inlining. */
11639 if (callee == dont_care)
11640 return true;
11641
11642 /* If the caller doesn't care, always allow inlining. */
11643 if (caller == dont_care)
11644 return true;
11645
11646 /* Otherwise, allow inlining if either the callee and caller values
11647 agree, or if the callee is using the default value. */
11648 return (callee == caller || callee == def);
11649 }
11650
11651 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11652 to inline CALLEE into CALLER based on target-specific info.
11653 Make sure that the caller and callee have compatible architectural
11654 features. Then go through the other possible target attributes
11655 and see if they can block inlining. Try not to reject always_inline
11656 callees unless they are incompatible architecturally. */
11657
11658 static bool
11659 aarch64_can_inline_p (tree caller, tree callee)
11660 {
11661 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11662 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11663
11664 struct cl_target_option *caller_opts
11665 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11666 : target_option_default_node);
11667
11668 struct cl_target_option *callee_opts
11669 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11670 : target_option_default_node);
11671
11672 /* Callee's ISA flags should be a subset of the caller's. */
11673 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11674 != callee_opts->x_aarch64_isa_flags)
11675 return false;
11676
11677 /* Allow non-strict aligned functions inlining into strict
11678 aligned ones. */
11679 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11680 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11681 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11682 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11683 return false;
11684
11685 bool always_inline = lookup_attribute ("always_inline",
11686 DECL_ATTRIBUTES (callee));
11687
11688 /* If the architectural features match up and the callee is always_inline
11689 then the other attributes don't matter. */
11690 if (always_inline)
11691 return true;
11692
11693 if (caller_opts->x_aarch64_cmodel_var
11694 != callee_opts->x_aarch64_cmodel_var)
11695 return false;
11696
11697 if (caller_opts->x_aarch64_tls_dialect
11698 != callee_opts->x_aarch64_tls_dialect)
11699 return false;
11700
11701 /* Honour explicit requests to workaround errata. */
11702 if (!aarch64_tribools_ok_for_inlining_p (
11703 caller_opts->x_aarch64_fix_a53_err835769,
11704 callee_opts->x_aarch64_fix_a53_err835769,
11705 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11706 return false;
11707
11708 if (!aarch64_tribools_ok_for_inlining_p (
11709 caller_opts->x_aarch64_fix_a53_err843419,
11710 callee_opts->x_aarch64_fix_a53_err843419,
11711 2, TARGET_FIX_ERR_A53_843419))
11712 return false;
11713
11714 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11715 caller and calle and they don't match up, reject inlining. */
11716 if (!aarch64_tribools_ok_for_inlining_p (
11717 caller_opts->x_flag_omit_leaf_frame_pointer,
11718 callee_opts->x_flag_omit_leaf_frame_pointer,
11719 2, 1))
11720 return false;
11721
11722 /* If the callee has specific tuning overrides, respect them. */
11723 if (callee_opts->x_aarch64_override_tune_string != NULL
11724 && caller_opts->x_aarch64_override_tune_string == NULL)
11725 return false;
11726
11727 /* If the user specified tuning override strings for the
11728 caller and callee and they don't match up, reject inlining.
11729 We just do a string compare here, we don't analyze the meaning
11730 of the string, as it would be too costly for little gain. */
11731 if (callee_opts->x_aarch64_override_tune_string
11732 && caller_opts->x_aarch64_override_tune_string
11733 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11734 caller_opts->x_aarch64_override_tune_string) != 0))
11735 return false;
11736
11737 return true;
11738 }
11739
11740 /* Return true if SYMBOL_REF X binds locally. */
11741
11742 static bool
11743 aarch64_symbol_binds_local_p (const_rtx x)
11744 {
11745 return (SYMBOL_REF_DECL (x)
11746 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11747 : SYMBOL_REF_LOCAL_P (x));
11748 }
11749
11750 /* Return true if SYMBOL_REF X is thread local */
11751 static bool
11752 aarch64_tls_symbol_p (rtx x)
11753 {
11754 if (! TARGET_HAVE_TLS)
11755 return false;
11756
11757 if (GET_CODE (x) != SYMBOL_REF)
11758 return false;
11759
11760 return SYMBOL_REF_TLS_MODEL (x) != 0;
11761 }
11762
11763 /* Classify a TLS symbol into one of the TLS kinds. */
11764 enum aarch64_symbol_type
11765 aarch64_classify_tls_symbol (rtx x)
11766 {
11767 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11768
11769 switch (tls_kind)
11770 {
11771 case TLS_MODEL_GLOBAL_DYNAMIC:
11772 case TLS_MODEL_LOCAL_DYNAMIC:
11773 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11774
11775 case TLS_MODEL_INITIAL_EXEC:
11776 switch (aarch64_cmodel)
11777 {
11778 case AARCH64_CMODEL_TINY:
11779 case AARCH64_CMODEL_TINY_PIC:
11780 return SYMBOL_TINY_TLSIE;
11781 default:
11782 return SYMBOL_SMALL_TLSIE;
11783 }
11784
11785 case TLS_MODEL_LOCAL_EXEC:
11786 if (aarch64_tls_size == 12)
11787 return SYMBOL_TLSLE12;
11788 else if (aarch64_tls_size == 24)
11789 return SYMBOL_TLSLE24;
11790 else if (aarch64_tls_size == 32)
11791 return SYMBOL_TLSLE32;
11792 else if (aarch64_tls_size == 48)
11793 return SYMBOL_TLSLE48;
11794 else
11795 gcc_unreachable ();
11796
11797 case TLS_MODEL_EMULATED:
11798 case TLS_MODEL_NONE:
11799 return SYMBOL_FORCE_TO_MEM;
11800
11801 default:
11802 gcc_unreachable ();
11803 }
11804 }
11805
11806 /* Return the correct method for accessing X + OFFSET, where X is either
11807 a SYMBOL_REF or LABEL_REF. */
11808
11809 enum aarch64_symbol_type
11810 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11811 {
11812 if (GET_CODE (x) == LABEL_REF)
11813 {
11814 switch (aarch64_cmodel)
11815 {
11816 case AARCH64_CMODEL_LARGE:
11817 return SYMBOL_FORCE_TO_MEM;
11818
11819 case AARCH64_CMODEL_TINY_PIC:
11820 case AARCH64_CMODEL_TINY:
11821 return SYMBOL_TINY_ABSOLUTE;
11822
11823 case AARCH64_CMODEL_SMALL_SPIC:
11824 case AARCH64_CMODEL_SMALL_PIC:
11825 case AARCH64_CMODEL_SMALL:
11826 return SYMBOL_SMALL_ABSOLUTE;
11827
11828 default:
11829 gcc_unreachable ();
11830 }
11831 }
11832
11833 if (GET_CODE (x) == SYMBOL_REF)
11834 {
11835 if (aarch64_tls_symbol_p (x))
11836 return aarch64_classify_tls_symbol (x);
11837
11838 switch (aarch64_cmodel)
11839 {
11840 case AARCH64_CMODEL_TINY:
11841 /* When we retrieve symbol + offset address, we have to make sure
11842 the offset does not cause overflow of the final address. But
11843 we have no way of knowing the address of symbol at compile time
11844 so we can't accurately say if the distance between the PC and
11845 symbol + offset is outside the addressible range of +/-1M in the
11846 TINY code model. So we rely on images not being greater than
11847 1M and cap the offset at 1M and anything beyond 1M will have to
11848 be loaded using an alternative mechanism. Furthermore if the
11849 symbol is a weak reference to something that isn't known to
11850 resolve to a symbol in this module, then force to memory. */
11851 if ((SYMBOL_REF_WEAK (x)
11852 && !aarch64_symbol_binds_local_p (x))
11853 || !IN_RANGE (offset, -1048575, 1048575))
11854 return SYMBOL_FORCE_TO_MEM;
11855 return SYMBOL_TINY_ABSOLUTE;
11856
11857 case AARCH64_CMODEL_SMALL:
11858 /* Same reasoning as the tiny code model, but the offset cap here is
11859 4G. */
11860 if ((SYMBOL_REF_WEAK (x)
11861 && !aarch64_symbol_binds_local_p (x))
11862 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11863 HOST_WIDE_INT_C (4294967264)))
11864 return SYMBOL_FORCE_TO_MEM;
11865 return SYMBOL_SMALL_ABSOLUTE;
11866
11867 case AARCH64_CMODEL_TINY_PIC:
11868 if (!aarch64_symbol_binds_local_p (x))
11869 return SYMBOL_TINY_GOT;
11870 return SYMBOL_TINY_ABSOLUTE;
11871
11872 case AARCH64_CMODEL_SMALL_SPIC:
11873 case AARCH64_CMODEL_SMALL_PIC:
11874 if (!aarch64_symbol_binds_local_p (x))
11875 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11876 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11877 return SYMBOL_SMALL_ABSOLUTE;
11878
11879 case AARCH64_CMODEL_LARGE:
11880 /* This is alright even in PIC code as the constant
11881 pool reference is always PC relative and within
11882 the same translation unit. */
11883 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11884 return SYMBOL_SMALL_ABSOLUTE;
11885 else
11886 return SYMBOL_FORCE_TO_MEM;
11887
11888 default:
11889 gcc_unreachable ();
11890 }
11891 }
11892
11893 /* By default push everything into the constant pool. */
11894 return SYMBOL_FORCE_TO_MEM;
11895 }
11896
11897 bool
11898 aarch64_constant_address_p (rtx x)
11899 {
11900 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11901 }
11902
11903 bool
11904 aarch64_legitimate_pic_operand_p (rtx x)
11905 {
11906 if (GET_CODE (x) == SYMBOL_REF
11907 || (GET_CODE (x) == CONST
11908 && GET_CODE (XEXP (x, 0)) == PLUS
11909 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11910 return false;
11911
11912 return true;
11913 }
11914
11915 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11916 that should be rematerialized rather than spilled. */
11917
11918 static bool
11919 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11920 {
11921 /* Support CSE and rematerialization of common constants. */
11922 if (CONST_INT_P (x)
11923 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11924 || GET_CODE (x) == CONST_VECTOR)
11925 return true;
11926
11927 /* Do not allow vector struct mode constants for Advanced SIMD.
11928 We could support 0 and -1 easily, but they need support in
11929 aarch64-simd.md. */
11930 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11931 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11932 return false;
11933
11934 /* Only accept variable-length vector constants if they can be
11935 handled directly.
11936
11937 ??? It would be possible to handle rematerialization of other
11938 constants via secondary reloads. */
11939 if (vec_flags & VEC_ANY_SVE)
11940 return aarch64_simd_valid_immediate (x, NULL);
11941
11942 if (GET_CODE (x) == HIGH)
11943 x = XEXP (x, 0);
11944
11945 /* Accept polynomial constants that can be calculated by using the
11946 destination of a move as the sole temporary. Constants that
11947 require a second temporary cannot be rematerialized (they can't be
11948 forced to memory and also aren't legitimate constants). */
11949 poly_int64 offset;
11950 if (poly_int_rtx_p (x, &offset))
11951 return aarch64_offset_temporaries (false, offset) <= 1;
11952
11953 /* If an offset is being added to something else, we need to allow the
11954 base to be moved into the destination register, meaning that there
11955 are no free temporaries for the offset. */
11956 x = strip_offset (x, &offset);
11957 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11958 return false;
11959
11960 /* Do not allow const (plus (anchor_symbol, const_int)). */
11961 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11962 return false;
11963
11964 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11965 so spilling them is better than rematerialization. */
11966 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11967 return true;
11968
11969 /* Label references are always constant. */
11970 if (GET_CODE (x) == LABEL_REF)
11971 return true;
11972
11973 return false;
11974 }
11975
11976 rtx
11977 aarch64_load_tp (rtx target)
11978 {
11979 if (!target
11980 || GET_MODE (target) != Pmode
11981 || !register_operand (target, Pmode))
11982 target = gen_reg_rtx (Pmode);
11983
11984 /* Can return in any reg. */
11985 emit_insn (gen_aarch64_load_tp_hard (target));
11986 return target;
11987 }
11988
11989 /* On AAPCS systems, this is the "struct __va_list". */
11990 static GTY(()) tree va_list_type;
11991
11992 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11993 Return the type to use as __builtin_va_list.
11994
11995 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11996
11997 struct __va_list
11998 {
11999 void *__stack;
12000 void *__gr_top;
12001 void *__vr_top;
12002 int __gr_offs;
12003 int __vr_offs;
12004 }; */
12005
12006 static tree
12007 aarch64_build_builtin_va_list (void)
12008 {
12009 tree va_list_name;
12010 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12011
12012 /* Create the type. */
12013 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12014 /* Give it the required name. */
12015 va_list_name = build_decl (BUILTINS_LOCATION,
12016 TYPE_DECL,
12017 get_identifier ("__va_list"),
12018 va_list_type);
12019 DECL_ARTIFICIAL (va_list_name) = 1;
12020 TYPE_NAME (va_list_type) = va_list_name;
12021 TYPE_STUB_DECL (va_list_type) = va_list_name;
12022
12023 /* Create the fields. */
12024 f_stack = build_decl (BUILTINS_LOCATION,
12025 FIELD_DECL, get_identifier ("__stack"),
12026 ptr_type_node);
12027 f_grtop = build_decl (BUILTINS_LOCATION,
12028 FIELD_DECL, get_identifier ("__gr_top"),
12029 ptr_type_node);
12030 f_vrtop = build_decl (BUILTINS_LOCATION,
12031 FIELD_DECL, get_identifier ("__vr_top"),
12032 ptr_type_node);
12033 f_groff = build_decl (BUILTINS_LOCATION,
12034 FIELD_DECL, get_identifier ("__gr_offs"),
12035 integer_type_node);
12036 f_vroff = build_decl (BUILTINS_LOCATION,
12037 FIELD_DECL, get_identifier ("__vr_offs"),
12038 integer_type_node);
12039
12040 /* Tell tree-stdarg pass about our internal offset fields.
12041 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
12042 purpose to identify whether the code is updating va_list internal
12043 offset fields through irregular way. */
12044 va_list_gpr_counter_field = f_groff;
12045 va_list_fpr_counter_field = f_vroff;
12046
12047 DECL_ARTIFICIAL (f_stack) = 1;
12048 DECL_ARTIFICIAL (f_grtop) = 1;
12049 DECL_ARTIFICIAL (f_vrtop) = 1;
12050 DECL_ARTIFICIAL (f_groff) = 1;
12051 DECL_ARTIFICIAL (f_vroff) = 1;
12052
12053 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12054 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12055 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12056 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12057 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12058
12059 TYPE_FIELDS (va_list_type) = f_stack;
12060 DECL_CHAIN (f_stack) = f_grtop;
12061 DECL_CHAIN (f_grtop) = f_vrtop;
12062 DECL_CHAIN (f_vrtop) = f_groff;
12063 DECL_CHAIN (f_groff) = f_vroff;
12064
12065 /* Compute its layout. */
12066 layout_type (va_list_type);
12067
12068 return va_list_type;
12069 }
12070
12071 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12072 static void
12073 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12074 {
12075 const CUMULATIVE_ARGS *cum;
12076 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12077 tree stack, grtop, vrtop, groff, vroff;
12078 tree t;
12079 int gr_save_area_size = cfun->va_list_gpr_size;
12080 int vr_save_area_size = cfun->va_list_fpr_size;
12081 int vr_offset;
12082
12083 cum = &crtl->args.info;
12084 if (cfun->va_list_gpr_size)
12085 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12086 cfun->va_list_gpr_size);
12087 if (cfun->va_list_fpr_size)
12088 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12089 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12090
12091 if (!TARGET_FLOAT)
12092 {
12093 gcc_assert (cum->aapcs_nvrn == 0);
12094 vr_save_area_size = 0;
12095 }
12096
12097 f_stack = TYPE_FIELDS (va_list_type_node);
12098 f_grtop = DECL_CHAIN (f_stack);
12099 f_vrtop = DECL_CHAIN (f_grtop);
12100 f_groff = DECL_CHAIN (f_vrtop);
12101 f_vroff = DECL_CHAIN (f_groff);
12102
12103 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12104 NULL_TREE);
12105 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12106 NULL_TREE);
12107 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12108 NULL_TREE);
12109 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12110 NULL_TREE);
12111 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12112 NULL_TREE);
12113
12114 /* Emit code to initialize STACK, which points to the next varargs stack
12115 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12116 by named arguments. STACK is 8-byte aligned. */
12117 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12118 if (cum->aapcs_stack_size > 0)
12119 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12120 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12121 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12122
12123 /* Emit code to initialize GRTOP, the top of the GR save area.
12124 virtual_incoming_args_rtx should have been 16 byte aligned. */
12125 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12126 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12127 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12128
12129 /* Emit code to initialize VRTOP, the top of the VR save area.
12130 This address is gr_save_area_bytes below GRTOP, rounded
12131 down to the next 16-byte boundary. */
12132 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12133 vr_offset = ROUND_UP (gr_save_area_size,
12134 STACK_BOUNDARY / BITS_PER_UNIT);
12135
12136 if (vr_offset)
12137 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12138 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12139 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12140
12141 /* Emit code to initialize GROFF, the offset from GRTOP of the
12142 next GPR argument. */
12143 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12144 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12145 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12146
12147 /* Likewise emit code to initialize VROFF, the offset from FTOP
12148 of the next VR argument. */
12149 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12150 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12151 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12152 }
12153
12154 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12155
12156 static tree
12157 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12158 gimple_seq *post_p ATTRIBUTE_UNUSED)
12159 {
12160 tree addr;
12161 bool indirect_p;
12162 bool is_ha; /* is HFA or HVA. */
12163 bool dw_align; /* double-word align. */
12164 machine_mode ag_mode = VOIDmode;
12165 int nregs;
12166 machine_mode mode;
12167
12168 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12169 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12170 HOST_WIDE_INT size, rsize, adjust, align;
12171 tree t, u, cond1, cond2;
12172
12173 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12174 if (indirect_p)
12175 type = build_pointer_type (type);
12176
12177 mode = TYPE_MODE (type);
12178
12179 f_stack = TYPE_FIELDS (va_list_type_node);
12180 f_grtop = DECL_CHAIN (f_stack);
12181 f_vrtop = DECL_CHAIN (f_grtop);
12182 f_groff = DECL_CHAIN (f_vrtop);
12183 f_vroff = DECL_CHAIN (f_groff);
12184
12185 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12186 f_stack, NULL_TREE);
12187 size = int_size_in_bytes (type);
12188 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12189
12190 dw_align = false;
12191 adjust = 0;
12192 if (aarch64_vfp_is_call_or_return_candidate (mode,
12193 type,
12194 &ag_mode,
12195 &nregs,
12196 &is_ha))
12197 {
12198 /* No frontends can create types with variable-sized modes, so we
12199 shouldn't be asked to pass or return them. */
12200 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12201
12202 /* TYPE passed in fp/simd registers. */
12203 if (!TARGET_FLOAT)
12204 aarch64_err_no_fpadvsimd (mode, "varargs");
12205
12206 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12207 unshare_expr (valist), f_vrtop, NULL_TREE);
12208 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12209 unshare_expr (valist), f_vroff, NULL_TREE);
12210
12211 rsize = nregs * UNITS_PER_VREG;
12212
12213 if (is_ha)
12214 {
12215 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12216 adjust = UNITS_PER_VREG - ag_size;
12217 }
12218 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12219 && size < UNITS_PER_VREG)
12220 {
12221 adjust = UNITS_PER_VREG - size;
12222 }
12223 }
12224 else
12225 {
12226 /* TYPE passed in general registers. */
12227 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12228 unshare_expr (valist), f_grtop, NULL_TREE);
12229 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12230 unshare_expr (valist), f_groff, NULL_TREE);
12231 rsize = ROUND_UP (size, UNITS_PER_WORD);
12232 nregs = rsize / UNITS_PER_WORD;
12233
12234 if (align > 8)
12235 dw_align = true;
12236
12237 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12238 && size < UNITS_PER_WORD)
12239 {
12240 adjust = UNITS_PER_WORD - size;
12241 }
12242 }
12243
12244 /* Get a local temporary for the field value. */
12245 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12246
12247 /* Emit code to branch if off >= 0. */
12248 t = build2 (GE_EXPR, boolean_type_node, off,
12249 build_int_cst (TREE_TYPE (off), 0));
12250 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12251
12252 if (dw_align)
12253 {
12254 /* Emit: offs = (offs + 15) & -16. */
12255 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12256 build_int_cst (TREE_TYPE (off), 15));
12257 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12258 build_int_cst (TREE_TYPE (off), -16));
12259 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12260 }
12261 else
12262 roundup = NULL;
12263
12264 /* Update ap.__[g|v]r_offs */
12265 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12266 build_int_cst (TREE_TYPE (off), rsize));
12267 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12268
12269 /* String up. */
12270 if (roundup)
12271 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12272
12273 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12274 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12275 build_int_cst (TREE_TYPE (f_off), 0));
12276 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12277
12278 /* String up: make sure the assignment happens before the use. */
12279 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12280 COND_EXPR_ELSE (cond1) = t;
12281
12282 /* Prepare the trees handling the argument that is passed on the stack;
12283 the top level node will store in ON_STACK. */
12284 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12285 if (align > 8)
12286 {
12287 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12288 t = fold_build_pointer_plus_hwi (arg, 15);
12289 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12290 build_int_cst (TREE_TYPE (t), -16));
12291 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12292 }
12293 else
12294 roundup = NULL;
12295 /* Advance ap.__stack */
12296 t = fold_build_pointer_plus_hwi (arg, size + 7);
12297 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12298 build_int_cst (TREE_TYPE (t), -8));
12299 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12300 /* String up roundup and advance. */
12301 if (roundup)
12302 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12303 /* String up with arg */
12304 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12305 /* Big-endianness related address adjustment. */
12306 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12307 && size < UNITS_PER_WORD)
12308 {
12309 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12310 size_int (UNITS_PER_WORD - size));
12311 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12312 }
12313
12314 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12315 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12316
12317 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12318 t = off;
12319 if (adjust)
12320 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12321 build_int_cst (TREE_TYPE (off), adjust));
12322
12323 t = fold_convert (sizetype, t);
12324 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12325
12326 if (is_ha)
12327 {
12328 /* type ha; // treat as "struct {ftype field[n];}"
12329 ... [computing offs]
12330 for (i = 0; i <nregs; ++i, offs += 16)
12331 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12332 return ha; */
12333 int i;
12334 tree tmp_ha, field_t, field_ptr_t;
12335
12336 /* Declare a local variable. */
12337 tmp_ha = create_tmp_var_raw (type, "ha");
12338 gimple_add_tmp_var (tmp_ha);
12339
12340 /* Establish the base type. */
12341 switch (ag_mode)
12342 {
12343 case E_SFmode:
12344 field_t = float_type_node;
12345 field_ptr_t = float_ptr_type_node;
12346 break;
12347 case E_DFmode:
12348 field_t = double_type_node;
12349 field_ptr_t = double_ptr_type_node;
12350 break;
12351 case E_TFmode:
12352 field_t = long_double_type_node;
12353 field_ptr_t = long_double_ptr_type_node;
12354 break;
12355 case E_HFmode:
12356 field_t = aarch64_fp16_type_node;
12357 field_ptr_t = aarch64_fp16_ptr_type_node;
12358 break;
12359 case E_V2SImode:
12360 case E_V4SImode:
12361 {
12362 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12363 field_t = build_vector_type_for_mode (innertype, ag_mode);
12364 field_ptr_t = build_pointer_type (field_t);
12365 }
12366 break;
12367 default:
12368 gcc_assert (0);
12369 }
12370
12371 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12372 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12373 addr = t;
12374 t = fold_convert (field_ptr_t, addr);
12375 t = build2 (MODIFY_EXPR, field_t,
12376 build1 (INDIRECT_REF, field_t, tmp_ha),
12377 build1 (INDIRECT_REF, field_t, t));
12378
12379 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12380 for (i = 1; i < nregs; ++i)
12381 {
12382 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12383 u = fold_convert (field_ptr_t, addr);
12384 u = build2 (MODIFY_EXPR, field_t,
12385 build2 (MEM_REF, field_t, tmp_ha,
12386 build_int_cst (field_ptr_t,
12387 (i *
12388 int_size_in_bytes (field_t)))),
12389 build1 (INDIRECT_REF, field_t, u));
12390 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12391 }
12392
12393 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12394 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12395 }
12396
12397 COND_EXPR_ELSE (cond2) = t;
12398 addr = fold_convert (build_pointer_type (type), cond1);
12399 addr = build_va_arg_indirect_ref (addr);
12400
12401 if (indirect_p)
12402 addr = build_va_arg_indirect_ref (addr);
12403
12404 return addr;
12405 }
12406
12407 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12408
12409 static void
12410 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12411 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12412 int no_rtl)
12413 {
12414 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12415 CUMULATIVE_ARGS local_cum;
12416 int gr_saved = cfun->va_list_gpr_size;
12417 int vr_saved = cfun->va_list_fpr_size;
12418
12419 /* The caller has advanced CUM up to, but not beyond, the last named
12420 argument. Advance a local copy of CUM past the last "real" named
12421 argument, to find out how many registers are left over. */
12422 local_cum = *cum;
12423 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12424
12425 /* Found out how many registers we need to save.
12426 Honor tree-stdvar analysis results. */
12427 if (cfun->va_list_gpr_size)
12428 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12429 cfun->va_list_gpr_size / UNITS_PER_WORD);
12430 if (cfun->va_list_fpr_size)
12431 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12432 cfun->va_list_fpr_size / UNITS_PER_VREG);
12433
12434 if (!TARGET_FLOAT)
12435 {
12436 gcc_assert (local_cum.aapcs_nvrn == 0);
12437 vr_saved = 0;
12438 }
12439
12440 if (!no_rtl)
12441 {
12442 if (gr_saved > 0)
12443 {
12444 rtx ptr, mem;
12445
12446 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12447 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12448 - gr_saved * UNITS_PER_WORD);
12449 mem = gen_frame_mem (BLKmode, ptr);
12450 set_mem_alias_set (mem, get_varargs_alias_set ());
12451
12452 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12453 mem, gr_saved);
12454 }
12455 if (vr_saved > 0)
12456 {
12457 /* We can't use move_block_from_reg, because it will use
12458 the wrong mode, storing D regs only. */
12459 machine_mode mode = TImode;
12460 int off, i, vr_start;
12461
12462 /* Set OFF to the offset from virtual_incoming_args_rtx of
12463 the first vector register. The VR save area lies below
12464 the GR one, and is aligned to 16 bytes. */
12465 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12466 STACK_BOUNDARY / BITS_PER_UNIT);
12467 off -= vr_saved * UNITS_PER_VREG;
12468
12469 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12470 for (i = 0; i < vr_saved; ++i)
12471 {
12472 rtx ptr, mem;
12473
12474 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12475 mem = gen_frame_mem (mode, ptr);
12476 set_mem_alias_set (mem, get_varargs_alias_set ());
12477 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12478 off += UNITS_PER_VREG;
12479 }
12480 }
12481 }
12482
12483 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12484 any complication of having crtl->args.pretend_args_size changed. */
12485 cfun->machine->frame.saved_varargs_size
12486 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12487 STACK_BOUNDARY / BITS_PER_UNIT)
12488 + vr_saved * UNITS_PER_VREG);
12489 }
12490
12491 static void
12492 aarch64_conditional_register_usage (void)
12493 {
12494 int i;
12495 if (!TARGET_FLOAT)
12496 {
12497 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12498 {
12499 fixed_regs[i] = 1;
12500 call_used_regs[i] = 1;
12501 }
12502 }
12503 if (!TARGET_SVE)
12504 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12505 {
12506 fixed_regs[i] = 1;
12507 call_used_regs[i] = 1;
12508 }
12509 }
12510
12511 /* Walk down the type tree of TYPE counting consecutive base elements.
12512 If *MODEP is VOIDmode, then set it to the first valid floating point
12513 type. If a non-floating point type is found, or if a floating point
12514 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12515 otherwise return the count in the sub-tree. */
12516 static int
12517 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12518 {
12519 machine_mode mode;
12520 HOST_WIDE_INT size;
12521
12522 switch (TREE_CODE (type))
12523 {
12524 case REAL_TYPE:
12525 mode = TYPE_MODE (type);
12526 if (mode != DFmode && mode != SFmode
12527 && mode != TFmode && mode != HFmode)
12528 return -1;
12529
12530 if (*modep == VOIDmode)
12531 *modep = mode;
12532
12533 if (*modep == mode)
12534 return 1;
12535
12536 break;
12537
12538 case COMPLEX_TYPE:
12539 mode = TYPE_MODE (TREE_TYPE (type));
12540 if (mode != DFmode && mode != SFmode
12541 && mode != TFmode && mode != HFmode)
12542 return -1;
12543
12544 if (*modep == VOIDmode)
12545 *modep = mode;
12546
12547 if (*modep == mode)
12548 return 2;
12549
12550 break;
12551
12552 case VECTOR_TYPE:
12553 /* Use V2SImode and V4SImode as representatives of all 64-bit
12554 and 128-bit vector types. */
12555 size = int_size_in_bytes (type);
12556 switch (size)
12557 {
12558 case 8:
12559 mode = V2SImode;
12560 break;
12561 case 16:
12562 mode = V4SImode;
12563 break;
12564 default:
12565 return -1;
12566 }
12567
12568 if (*modep == VOIDmode)
12569 *modep = mode;
12570
12571 /* Vector modes are considered to be opaque: two vectors are
12572 equivalent for the purposes of being homogeneous aggregates
12573 if they are the same size. */
12574 if (*modep == mode)
12575 return 1;
12576
12577 break;
12578
12579 case ARRAY_TYPE:
12580 {
12581 int count;
12582 tree index = TYPE_DOMAIN (type);
12583
12584 /* Can't handle incomplete types nor sizes that are not
12585 fixed. */
12586 if (!COMPLETE_TYPE_P (type)
12587 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12588 return -1;
12589
12590 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12591 if (count == -1
12592 || !index
12593 || !TYPE_MAX_VALUE (index)
12594 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12595 || !TYPE_MIN_VALUE (index)
12596 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12597 || count < 0)
12598 return -1;
12599
12600 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12601 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12602
12603 /* There must be no padding. */
12604 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12605 count * GET_MODE_BITSIZE (*modep)))
12606 return -1;
12607
12608 return count;
12609 }
12610
12611 case RECORD_TYPE:
12612 {
12613 int count = 0;
12614 int sub_count;
12615 tree field;
12616
12617 /* Can't handle incomplete types nor sizes that are not
12618 fixed. */
12619 if (!COMPLETE_TYPE_P (type)
12620 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12621 return -1;
12622
12623 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12624 {
12625 if (TREE_CODE (field) != FIELD_DECL)
12626 continue;
12627
12628 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12629 if (sub_count < 0)
12630 return -1;
12631 count += sub_count;
12632 }
12633
12634 /* There must be no padding. */
12635 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12636 count * GET_MODE_BITSIZE (*modep)))
12637 return -1;
12638
12639 return count;
12640 }
12641
12642 case UNION_TYPE:
12643 case QUAL_UNION_TYPE:
12644 {
12645 /* These aren't very interesting except in a degenerate case. */
12646 int count = 0;
12647 int sub_count;
12648 tree field;
12649
12650 /* Can't handle incomplete types nor sizes that are not
12651 fixed. */
12652 if (!COMPLETE_TYPE_P (type)
12653 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12654 return -1;
12655
12656 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12657 {
12658 if (TREE_CODE (field) != FIELD_DECL)
12659 continue;
12660
12661 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12662 if (sub_count < 0)
12663 return -1;
12664 count = count > sub_count ? count : sub_count;
12665 }
12666
12667 /* There must be no padding. */
12668 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12669 count * GET_MODE_BITSIZE (*modep)))
12670 return -1;
12671
12672 return count;
12673 }
12674
12675 default:
12676 break;
12677 }
12678
12679 return -1;
12680 }
12681
12682 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12683 type as described in AAPCS64 \S 4.1.2.
12684
12685 See the comment above aarch64_composite_type_p for the notes on MODE. */
12686
12687 static bool
12688 aarch64_short_vector_p (const_tree type,
12689 machine_mode mode)
12690 {
12691 poly_int64 size = -1;
12692
12693 if (type && TREE_CODE (type) == VECTOR_TYPE)
12694 size = int_size_in_bytes (type);
12695 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12696 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12697 size = GET_MODE_SIZE (mode);
12698
12699 return known_eq (size, 8) || known_eq (size, 16);
12700 }
12701
12702 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12703 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12704 array types. The C99 floating-point complex types are also considered
12705 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12706 types, which are GCC extensions and out of the scope of AAPCS64, are
12707 treated as composite types here as well.
12708
12709 Note that MODE itself is not sufficient in determining whether a type
12710 is such a composite type or not. This is because
12711 stor-layout.c:compute_record_mode may have already changed the MODE
12712 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12713 structure with only one field may have its MODE set to the mode of the
12714 field. Also an integer mode whose size matches the size of the
12715 RECORD_TYPE type may be used to substitute the original mode
12716 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12717 solely relied on. */
12718
12719 static bool
12720 aarch64_composite_type_p (const_tree type,
12721 machine_mode mode)
12722 {
12723 if (aarch64_short_vector_p (type, mode))
12724 return false;
12725
12726 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12727 return true;
12728
12729 if (mode == BLKmode
12730 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12731 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12732 return true;
12733
12734 return false;
12735 }
12736
12737 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12738 shall be passed or returned in simd/fp register(s) (providing these
12739 parameter passing registers are available).
12740
12741 Upon successful return, *COUNT returns the number of needed registers,
12742 *BASE_MODE returns the mode of the individual register and when IS_HAF
12743 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12744 floating-point aggregate or a homogeneous short-vector aggregate. */
12745
12746 static bool
12747 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12748 const_tree type,
12749 machine_mode *base_mode,
12750 int *count,
12751 bool *is_ha)
12752 {
12753 machine_mode new_mode = VOIDmode;
12754 bool composite_p = aarch64_composite_type_p (type, mode);
12755
12756 if (is_ha != NULL) *is_ha = false;
12757
12758 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12759 || aarch64_short_vector_p (type, mode))
12760 {
12761 *count = 1;
12762 new_mode = mode;
12763 }
12764 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12765 {
12766 if (is_ha != NULL) *is_ha = true;
12767 *count = 2;
12768 new_mode = GET_MODE_INNER (mode);
12769 }
12770 else if (type && composite_p)
12771 {
12772 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12773
12774 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12775 {
12776 if (is_ha != NULL) *is_ha = true;
12777 *count = ag_count;
12778 }
12779 else
12780 return false;
12781 }
12782 else
12783 return false;
12784
12785 *base_mode = new_mode;
12786 return true;
12787 }
12788
12789 /* Implement TARGET_STRUCT_VALUE_RTX. */
12790
12791 static rtx
12792 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12793 int incoming ATTRIBUTE_UNUSED)
12794 {
12795 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12796 }
12797
12798 /* Implements target hook vector_mode_supported_p. */
12799 static bool
12800 aarch64_vector_mode_supported_p (machine_mode mode)
12801 {
12802 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12803 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12804 }
12805
12806 /* Return appropriate SIMD container
12807 for MODE within a vector of WIDTH bits. */
12808 static machine_mode
12809 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12810 {
12811 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12812 switch (mode)
12813 {
12814 case E_DFmode:
12815 return VNx2DFmode;
12816 case E_SFmode:
12817 return VNx4SFmode;
12818 case E_HFmode:
12819 return VNx8HFmode;
12820 case E_DImode:
12821 return VNx2DImode;
12822 case E_SImode:
12823 return VNx4SImode;
12824 case E_HImode:
12825 return VNx8HImode;
12826 case E_QImode:
12827 return VNx16QImode;
12828 default:
12829 return word_mode;
12830 }
12831
12832 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12833 if (TARGET_SIMD)
12834 {
12835 if (known_eq (width, 128))
12836 switch (mode)
12837 {
12838 case E_DFmode:
12839 return V2DFmode;
12840 case E_SFmode:
12841 return V4SFmode;
12842 case E_HFmode:
12843 return V8HFmode;
12844 case E_SImode:
12845 return V4SImode;
12846 case E_HImode:
12847 return V8HImode;
12848 case E_QImode:
12849 return V16QImode;
12850 case E_DImode:
12851 return V2DImode;
12852 default:
12853 break;
12854 }
12855 else
12856 switch (mode)
12857 {
12858 case E_SFmode:
12859 return V2SFmode;
12860 case E_HFmode:
12861 return V4HFmode;
12862 case E_SImode:
12863 return V2SImode;
12864 case E_HImode:
12865 return V4HImode;
12866 case E_QImode:
12867 return V8QImode;
12868 default:
12869 break;
12870 }
12871 }
12872 return word_mode;
12873 }
12874
12875 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12876 static machine_mode
12877 aarch64_preferred_simd_mode (scalar_mode mode)
12878 {
12879 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12880 return aarch64_simd_container_mode (mode, bits);
12881 }
12882
12883 /* Return a list of possible vector sizes for the vectorizer
12884 to iterate over. */
12885 static void
12886 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12887 {
12888 if (TARGET_SVE)
12889 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12890 sizes->safe_push (16);
12891 sizes->safe_push (8);
12892 }
12893
12894 /* Implement TARGET_MANGLE_TYPE. */
12895
12896 static const char *
12897 aarch64_mangle_type (const_tree type)
12898 {
12899 /* The AArch64 ABI documents say that "__va_list" has to be
12900 managled as if it is in the "std" namespace. */
12901 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12902 return "St9__va_list";
12903
12904 /* Half-precision float. */
12905 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12906 return "Dh";
12907
12908 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12909 builtin types. */
12910 if (TYPE_NAME (type) != NULL)
12911 return aarch64_mangle_builtin_type (type);
12912
12913 /* Use the default mangling. */
12914 return NULL;
12915 }
12916
12917 /* Find the first rtx_insn before insn that will generate an assembly
12918 instruction. */
12919
12920 static rtx_insn *
12921 aarch64_prev_real_insn (rtx_insn *insn)
12922 {
12923 if (!insn)
12924 return NULL;
12925
12926 do
12927 {
12928 insn = prev_real_insn (insn);
12929 }
12930 while (insn && recog_memoized (insn) < 0);
12931
12932 return insn;
12933 }
12934
12935 static bool
12936 is_madd_op (enum attr_type t1)
12937 {
12938 unsigned int i;
12939 /* A number of these may be AArch32 only. */
12940 enum attr_type mlatypes[] = {
12941 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12942 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12943 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12944 };
12945
12946 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12947 {
12948 if (t1 == mlatypes[i])
12949 return true;
12950 }
12951
12952 return false;
12953 }
12954
12955 /* Check if there is a register dependency between a load and the insn
12956 for which we hold recog_data. */
12957
12958 static bool
12959 dep_between_memop_and_curr (rtx memop)
12960 {
12961 rtx load_reg;
12962 int opno;
12963
12964 gcc_assert (GET_CODE (memop) == SET);
12965
12966 if (!REG_P (SET_DEST (memop)))
12967 return false;
12968
12969 load_reg = SET_DEST (memop);
12970 for (opno = 1; opno < recog_data.n_operands; opno++)
12971 {
12972 rtx operand = recog_data.operand[opno];
12973 if (REG_P (operand)
12974 && reg_overlap_mentioned_p (load_reg, operand))
12975 return true;
12976
12977 }
12978 return false;
12979 }
12980
12981
12982 /* When working around the Cortex-A53 erratum 835769,
12983 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12984 instruction and has a preceding memory instruction such that a NOP
12985 should be inserted between them. */
12986
12987 bool
12988 aarch64_madd_needs_nop (rtx_insn* insn)
12989 {
12990 enum attr_type attr_type;
12991 rtx_insn *prev;
12992 rtx body;
12993
12994 if (!TARGET_FIX_ERR_A53_835769)
12995 return false;
12996
12997 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12998 return false;
12999
13000 attr_type = get_attr_type (insn);
13001 if (!is_madd_op (attr_type))
13002 return false;
13003
13004 prev = aarch64_prev_real_insn (insn);
13005 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13006 Restore recog state to INSN to avoid state corruption. */
13007 extract_constrain_insn_cached (insn);
13008
13009 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13010 return false;
13011
13012 body = single_set (prev);
13013
13014 /* If the previous insn is a memory op and there is no dependency between
13015 it and the DImode madd, emit a NOP between them. If body is NULL then we
13016 have a complex memory operation, probably a load/store pair.
13017 Be conservative for now and emit a NOP. */
13018 if (GET_MODE (recog_data.operand[0]) == DImode
13019 && (!body || !dep_between_memop_and_curr (body)))
13020 return true;
13021
13022 return false;
13023
13024 }
13025
13026
13027 /* Implement FINAL_PRESCAN_INSN. */
13028
13029 void
13030 aarch64_final_prescan_insn (rtx_insn *insn)
13031 {
13032 if (aarch64_madd_needs_nop (insn))
13033 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13034 }
13035
13036
13037 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13038 instruction. */
13039
13040 bool
13041 aarch64_sve_index_immediate_p (rtx base_or_step)
13042 {
13043 return (CONST_INT_P (base_or_step)
13044 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13045 }
13046
13047 /* Return true if X is a valid immediate for the SVE ADD and SUB
13048 instructions. Negate X first if NEGATE_P is true. */
13049
13050 bool
13051 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13052 {
13053 rtx elt;
13054
13055 if (!const_vec_duplicate_p (x, &elt)
13056 || !CONST_INT_P (elt))
13057 return false;
13058
13059 HOST_WIDE_INT val = INTVAL (elt);
13060 if (negate_p)
13061 val = -val;
13062 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13063
13064 if (val & 0xff)
13065 return IN_RANGE (val, 0, 0xff);
13066 return IN_RANGE (val, 0, 0xff00);
13067 }
13068
13069 /* Return true if X is a valid immediate operand for an SVE logical
13070 instruction such as AND. */
13071
13072 bool
13073 aarch64_sve_bitmask_immediate_p (rtx x)
13074 {
13075 rtx elt;
13076
13077 return (const_vec_duplicate_p (x, &elt)
13078 && CONST_INT_P (elt)
13079 && aarch64_bitmask_imm (INTVAL (elt),
13080 GET_MODE_INNER (GET_MODE (x))));
13081 }
13082
13083 /* Return true if X is a valid immediate for the SVE DUP and CPY
13084 instructions. */
13085
13086 bool
13087 aarch64_sve_dup_immediate_p (rtx x)
13088 {
13089 rtx elt;
13090
13091 if (!const_vec_duplicate_p (x, &elt)
13092 || !CONST_INT_P (elt))
13093 return false;
13094
13095 HOST_WIDE_INT val = INTVAL (elt);
13096 if (val & 0xff)
13097 return IN_RANGE (val, -0x80, 0x7f);
13098 return IN_RANGE (val, -0x8000, 0x7f00);
13099 }
13100
13101 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13102 SIGNED_P says whether the operand is signed rather than unsigned. */
13103
13104 bool
13105 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13106 {
13107 rtx elt;
13108
13109 return (const_vec_duplicate_p (x, &elt)
13110 && CONST_INT_P (elt)
13111 && (signed_p
13112 ? IN_RANGE (INTVAL (elt), -16, 15)
13113 : IN_RANGE (INTVAL (elt), 0, 127)));
13114 }
13115
13116 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13117 instruction. Negate X first if NEGATE_P is true. */
13118
13119 bool
13120 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13121 {
13122 rtx elt;
13123 REAL_VALUE_TYPE r;
13124
13125 if (!const_vec_duplicate_p (x, &elt)
13126 || GET_CODE (elt) != CONST_DOUBLE)
13127 return false;
13128
13129 r = *CONST_DOUBLE_REAL_VALUE (elt);
13130
13131 if (negate_p)
13132 r = real_value_negate (&r);
13133
13134 if (real_equal (&r, &dconst1))
13135 return true;
13136 if (real_equal (&r, &dconsthalf))
13137 return true;
13138 return false;
13139 }
13140
13141 /* Return true if X is a valid immediate operand for an SVE FMUL
13142 instruction. */
13143
13144 bool
13145 aarch64_sve_float_mul_immediate_p (rtx x)
13146 {
13147 rtx elt;
13148
13149 /* GCC will never generate a multiply with an immediate of 2, so there is no
13150 point testing for it (even though it is a valid constant). */
13151 return (const_vec_duplicate_p (x, &elt)
13152 && GET_CODE (elt) == CONST_DOUBLE
13153 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13154 }
13155
13156 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13157 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13158 is nonnull, use it to describe valid immediates. */
13159 static bool
13160 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13161 simd_immediate_info *info,
13162 enum simd_immediate_check which,
13163 simd_immediate_info::insn_type insn)
13164 {
13165 /* Try a 4-byte immediate with LSL. */
13166 for (unsigned int shift = 0; shift < 32; shift += 8)
13167 if ((val32 & (0xff << shift)) == val32)
13168 {
13169 if (info)
13170 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13171 simd_immediate_info::LSL, shift);
13172 return true;
13173 }
13174
13175 /* Try a 2-byte immediate with LSL. */
13176 unsigned int imm16 = val32 & 0xffff;
13177 if (imm16 == (val32 >> 16))
13178 for (unsigned int shift = 0; shift < 16; shift += 8)
13179 if ((imm16 & (0xff << shift)) == imm16)
13180 {
13181 if (info)
13182 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13183 simd_immediate_info::LSL, shift);
13184 return true;
13185 }
13186
13187 /* Try a 4-byte immediate with MSL, except for cases that MVN
13188 can handle. */
13189 if (which == AARCH64_CHECK_MOV)
13190 for (unsigned int shift = 8; shift < 24; shift += 8)
13191 {
13192 unsigned int low = (1 << shift) - 1;
13193 if (((val32 & (0xff << shift)) | low) == val32)
13194 {
13195 if (info)
13196 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13197 simd_immediate_info::MSL, shift);
13198 return true;
13199 }
13200 }
13201
13202 return false;
13203 }
13204
13205 /* Return true if replicating VAL64 is a valid immediate for the
13206 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13207 use it to describe valid immediates. */
13208 static bool
13209 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13210 simd_immediate_info *info,
13211 enum simd_immediate_check which)
13212 {
13213 unsigned int val32 = val64 & 0xffffffff;
13214 unsigned int val16 = val64 & 0xffff;
13215 unsigned int val8 = val64 & 0xff;
13216
13217 if (val32 == (val64 >> 32))
13218 {
13219 if ((which & AARCH64_CHECK_ORR) != 0
13220 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13221 simd_immediate_info::MOV))
13222 return true;
13223
13224 if ((which & AARCH64_CHECK_BIC) != 0
13225 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13226 simd_immediate_info::MVN))
13227 return true;
13228
13229 /* Try using a replicated byte. */
13230 if (which == AARCH64_CHECK_MOV
13231 && val16 == (val32 >> 16)
13232 && val8 == (val16 >> 8))
13233 {
13234 if (info)
13235 *info = simd_immediate_info (QImode, val8);
13236 return true;
13237 }
13238 }
13239
13240 /* Try using a bit-to-bytemask. */
13241 if (which == AARCH64_CHECK_MOV)
13242 {
13243 unsigned int i;
13244 for (i = 0; i < 64; i += 8)
13245 {
13246 unsigned char byte = (val64 >> i) & 0xff;
13247 if (byte != 0 && byte != 0xff)
13248 break;
13249 }
13250 if (i == 64)
13251 {
13252 if (info)
13253 *info = simd_immediate_info (DImode, val64);
13254 return true;
13255 }
13256 }
13257 return false;
13258 }
13259
13260 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13261 instruction. If INFO is nonnull, use it to describe valid immediates. */
13262
13263 static bool
13264 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13265 simd_immediate_info *info)
13266 {
13267 scalar_int_mode mode = DImode;
13268 unsigned int val32 = val64 & 0xffffffff;
13269 if (val32 == (val64 >> 32))
13270 {
13271 mode = SImode;
13272 unsigned int val16 = val32 & 0xffff;
13273 if (val16 == (val32 >> 16))
13274 {
13275 mode = HImode;
13276 unsigned int val8 = val16 & 0xff;
13277 if (val8 == (val16 >> 8))
13278 mode = QImode;
13279 }
13280 }
13281 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13282 if (IN_RANGE (val, -0x80, 0x7f))
13283 {
13284 /* DUP with no shift. */
13285 if (info)
13286 *info = simd_immediate_info (mode, val);
13287 return true;
13288 }
13289 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13290 {
13291 /* DUP with LSL #8. */
13292 if (info)
13293 *info = simd_immediate_info (mode, val);
13294 return true;
13295 }
13296 if (aarch64_bitmask_imm (val64, mode))
13297 {
13298 /* DUPM. */
13299 if (info)
13300 *info = simd_immediate_info (mode, val);
13301 return true;
13302 }
13303 return false;
13304 }
13305
13306 /* Return true if OP is a valid SIMD immediate for the operation
13307 described by WHICH. If INFO is nonnull, use it to describe valid
13308 immediates. */
13309 bool
13310 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13311 enum simd_immediate_check which)
13312 {
13313 machine_mode mode = GET_MODE (op);
13314 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13315 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13316 return false;
13317
13318 scalar_mode elt_mode = GET_MODE_INNER (mode);
13319 rtx base, step;
13320 unsigned int n_elts;
13321 if (GET_CODE (op) == CONST_VECTOR
13322 && CONST_VECTOR_DUPLICATE_P (op))
13323 n_elts = CONST_VECTOR_NPATTERNS (op);
13324 else if ((vec_flags & VEC_SVE_DATA)
13325 && const_vec_series_p (op, &base, &step))
13326 {
13327 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13328 if (!aarch64_sve_index_immediate_p (base)
13329 || !aarch64_sve_index_immediate_p (step))
13330 return false;
13331
13332 if (info)
13333 *info = simd_immediate_info (elt_mode, base, step);
13334 return true;
13335 }
13336 else if (GET_CODE (op) == CONST_VECTOR
13337 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13338 /* N_ELTS set above. */;
13339 else
13340 return false;
13341
13342 /* Handle PFALSE and PTRUE. */
13343 if (vec_flags & VEC_SVE_PRED)
13344 return (op == CONST0_RTX (mode)
13345 || op == CONSTM1_RTX (mode));
13346
13347 scalar_float_mode elt_float_mode;
13348 if (n_elts == 1
13349 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13350 {
13351 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13352 if (aarch64_float_const_zero_rtx_p (elt)
13353 || aarch64_float_const_representable_p (elt))
13354 {
13355 if (info)
13356 *info = simd_immediate_info (elt_float_mode, elt);
13357 return true;
13358 }
13359 }
13360
13361 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13362 if (elt_size > 8)
13363 return false;
13364
13365 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13366
13367 /* Expand the vector constant out into a byte vector, with the least
13368 significant byte of the register first. */
13369 auto_vec<unsigned char, 16> bytes;
13370 bytes.reserve (n_elts * elt_size);
13371 for (unsigned int i = 0; i < n_elts; i++)
13372 {
13373 /* The vector is provided in gcc endian-neutral fashion.
13374 For aarch64_be Advanced SIMD, it must be laid out in the vector
13375 register in reverse order. */
13376 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13377 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13378
13379 if (elt_mode != elt_int_mode)
13380 elt = gen_lowpart (elt_int_mode, elt);
13381
13382 if (!CONST_INT_P (elt))
13383 return false;
13384
13385 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13386 for (unsigned int byte = 0; byte < elt_size; byte++)
13387 {
13388 bytes.quick_push (elt_val & 0xff);
13389 elt_val >>= BITS_PER_UNIT;
13390 }
13391 }
13392
13393 /* The immediate must repeat every eight bytes. */
13394 unsigned int nbytes = bytes.length ();
13395 for (unsigned i = 8; i < nbytes; ++i)
13396 if (bytes[i] != bytes[i - 8])
13397 return false;
13398
13399 /* Get the repeating 8-byte value as an integer. No endian correction
13400 is needed here because bytes is already in lsb-first order. */
13401 unsigned HOST_WIDE_INT val64 = 0;
13402 for (unsigned int i = 0; i < 8; i++)
13403 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13404 << (i * BITS_PER_UNIT));
13405
13406 if (vec_flags & VEC_SVE_DATA)
13407 return aarch64_sve_valid_immediate (val64, info);
13408 else
13409 return aarch64_advsimd_valid_immediate (val64, info, which);
13410 }
13411
13412 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13413 has a step in the range of INDEX. Return the index expression if so,
13414 otherwise return null. */
13415 rtx
13416 aarch64_check_zero_based_sve_index_immediate (rtx x)
13417 {
13418 rtx base, step;
13419 if (const_vec_series_p (x, &base, &step)
13420 && base == const0_rtx
13421 && aarch64_sve_index_immediate_p (step))
13422 return step;
13423 return NULL_RTX;
13424 }
13425
13426 /* Check of immediate shift constants are within range. */
13427 bool
13428 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13429 {
13430 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13431 if (left)
13432 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13433 else
13434 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13435 }
13436
13437 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13438 operation of width WIDTH at bit position POS. */
13439
13440 rtx
13441 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13442 {
13443 gcc_assert (CONST_INT_P (width));
13444 gcc_assert (CONST_INT_P (pos));
13445
13446 unsigned HOST_WIDE_INT mask
13447 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13448 return GEN_INT (mask << UINTVAL (pos));
13449 }
13450
13451 bool
13452 aarch64_mov_operand_p (rtx x, machine_mode mode)
13453 {
13454 if (GET_CODE (x) == HIGH
13455 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13456 return true;
13457
13458 if (CONST_INT_P (x))
13459 return true;
13460
13461 if (VECTOR_MODE_P (GET_MODE (x)))
13462 return aarch64_simd_valid_immediate (x, NULL);
13463
13464 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13465 return true;
13466
13467 if (aarch64_sve_cnt_immediate_p (x))
13468 return true;
13469
13470 return aarch64_classify_symbolic_expression (x)
13471 == SYMBOL_TINY_ABSOLUTE;
13472 }
13473
13474 /* Return a const_int vector of VAL. */
13475 rtx
13476 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13477 {
13478 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13479 return gen_const_vec_duplicate (mode, c);
13480 }
13481
13482 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13483
13484 bool
13485 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13486 {
13487 machine_mode vmode;
13488
13489 vmode = aarch64_simd_container_mode (mode, 64);
13490 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13491 return aarch64_simd_valid_immediate (op_v, NULL);
13492 }
13493
13494 /* Construct and return a PARALLEL RTX vector with elements numbering the
13495 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13496 the vector - from the perspective of the architecture. This does not
13497 line up with GCC's perspective on lane numbers, so we end up with
13498 different masks depending on our target endian-ness. The diagram
13499 below may help. We must draw the distinction when building masks
13500 which select one half of the vector. An instruction selecting
13501 architectural low-lanes for a big-endian target, must be described using
13502 a mask selecting GCC high-lanes.
13503
13504 Big-Endian Little-Endian
13505
13506 GCC 0 1 2 3 3 2 1 0
13507 | x | x | x | x | | x | x | x | x |
13508 Architecture 3 2 1 0 3 2 1 0
13509
13510 Low Mask: { 2, 3 } { 0, 1 }
13511 High Mask: { 0, 1 } { 2, 3 }
13512
13513 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13514
13515 rtx
13516 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13517 {
13518 rtvec v = rtvec_alloc (nunits / 2);
13519 int high_base = nunits / 2;
13520 int low_base = 0;
13521 int base;
13522 rtx t1;
13523 int i;
13524
13525 if (BYTES_BIG_ENDIAN)
13526 base = high ? low_base : high_base;
13527 else
13528 base = high ? high_base : low_base;
13529
13530 for (i = 0; i < nunits / 2; i++)
13531 RTVEC_ELT (v, i) = GEN_INT (base + i);
13532
13533 t1 = gen_rtx_PARALLEL (mode, v);
13534 return t1;
13535 }
13536
13537 /* Check OP for validity as a PARALLEL RTX vector with elements
13538 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13539 from the perspective of the architecture. See the diagram above
13540 aarch64_simd_vect_par_cnst_half for more details. */
13541
13542 bool
13543 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13544 bool high)
13545 {
13546 int nelts;
13547 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13548 return false;
13549
13550 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13551 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13552 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13553 int i = 0;
13554
13555 if (count_op != count_ideal)
13556 return false;
13557
13558 for (i = 0; i < count_ideal; i++)
13559 {
13560 rtx elt_op = XVECEXP (op, 0, i);
13561 rtx elt_ideal = XVECEXP (ideal, 0, i);
13562
13563 if (!CONST_INT_P (elt_op)
13564 || INTVAL (elt_ideal) != INTVAL (elt_op))
13565 return false;
13566 }
13567 return true;
13568 }
13569
13570 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13571 HIGH (exclusive). */
13572 void
13573 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13574 const_tree exp)
13575 {
13576 HOST_WIDE_INT lane;
13577 gcc_assert (CONST_INT_P (operand));
13578 lane = INTVAL (operand);
13579
13580 if (lane < low || lane >= high)
13581 {
13582 if (exp)
13583 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13584 else
13585 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13586 }
13587 }
13588
13589 /* Peform endian correction on lane number N, which indexes a vector
13590 of mode MODE, and return the result as an SImode rtx. */
13591
13592 rtx
13593 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13594 {
13595 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13596 }
13597
13598 /* Return TRUE if OP is a valid vector addressing mode. */
13599
13600 bool
13601 aarch64_simd_mem_operand_p (rtx op)
13602 {
13603 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13604 || REG_P (XEXP (op, 0)));
13605 }
13606
13607 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13608
13609 bool
13610 aarch64_sve_ld1r_operand_p (rtx op)
13611 {
13612 struct aarch64_address_info addr;
13613 scalar_mode mode;
13614
13615 return (MEM_P (op)
13616 && is_a <scalar_mode> (GET_MODE (op), &mode)
13617 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13618 && addr.type == ADDRESS_REG_IMM
13619 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13620 }
13621
13622 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13623 The conditions for STR are the same. */
13624 bool
13625 aarch64_sve_ldr_operand_p (rtx op)
13626 {
13627 struct aarch64_address_info addr;
13628
13629 return (MEM_P (op)
13630 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13631 false, ADDR_QUERY_ANY)
13632 && addr.type == ADDRESS_REG_IMM);
13633 }
13634
13635 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13636 We need to be able to access the individual pieces, so the range
13637 is different from LD[234] and ST[234]. */
13638 bool
13639 aarch64_sve_struct_memory_operand_p (rtx op)
13640 {
13641 if (!MEM_P (op))
13642 return false;
13643
13644 machine_mode mode = GET_MODE (op);
13645 struct aarch64_address_info addr;
13646 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13647 ADDR_QUERY_ANY)
13648 || addr.type != ADDRESS_REG_IMM)
13649 return false;
13650
13651 poly_int64 first = addr.const_offset;
13652 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13653 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13654 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13655 }
13656
13657 /* Emit a register copy from operand to operand, taking care not to
13658 early-clobber source registers in the process.
13659
13660 COUNT is the number of components into which the copy needs to be
13661 decomposed. */
13662 void
13663 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13664 unsigned int count)
13665 {
13666 unsigned int i;
13667 int rdest = REGNO (operands[0]);
13668 int rsrc = REGNO (operands[1]);
13669
13670 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13671 || rdest < rsrc)
13672 for (i = 0; i < count; i++)
13673 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13674 gen_rtx_REG (mode, rsrc + i));
13675 else
13676 for (i = 0; i < count; i++)
13677 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13678 gen_rtx_REG (mode, rsrc + count - i - 1));
13679 }
13680
13681 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13682 one of VSTRUCT modes: OI, CI, or XI. */
13683 int
13684 aarch64_simd_attr_length_rglist (machine_mode mode)
13685 {
13686 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13687 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13688 }
13689
13690 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13691 alignment of a vector to 128 bits. SVE predicates have an alignment of
13692 16 bits. */
13693 static HOST_WIDE_INT
13694 aarch64_simd_vector_alignment (const_tree type)
13695 {
13696 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13697 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13698 be set for non-predicate vectors of booleans. Modes are the most
13699 direct way we have of identifying real SVE predicate types. */
13700 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13701 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13702 return MIN (align, 128);
13703 }
13704
13705 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13706 static HOST_WIDE_INT
13707 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13708 {
13709 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13710 {
13711 /* If the length of the vector is fixed, try to align to that length,
13712 otherwise don't try to align at all. */
13713 HOST_WIDE_INT result;
13714 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13715 result = TYPE_ALIGN (TREE_TYPE (type));
13716 return result;
13717 }
13718 return TYPE_ALIGN (type);
13719 }
13720
13721 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13722 static bool
13723 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13724 {
13725 if (is_packed)
13726 return false;
13727
13728 /* For fixed-length vectors, check that the vectorizer will aim for
13729 full-vector alignment. This isn't true for generic GCC vectors
13730 that are wider than the ABI maximum of 128 bits. */
13731 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13732 && (wi::to_widest (TYPE_SIZE (type))
13733 != aarch64_vectorize_preferred_vector_alignment (type)))
13734 return false;
13735
13736 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13737 return true;
13738 }
13739
13740 /* Return true if the vector misalignment factor is supported by the
13741 target. */
13742 static bool
13743 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13744 const_tree type, int misalignment,
13745 bool is_packed)
13746 {
13747 if (TARGET_SIMD && STRICT_ALIGNMENT)
13748 {
13749 /* Return if movmisalign pattern is not supported for this mode. */
13750 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13751 return false;
13752
13753 /* Misalignment factor is unknown at compile time. */
13754 if (misalignment == -1)
13755 return false;
13756 }
13757 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13758 is_packed);
13759 }
13760
13761 /* If VALS is a vector constant that can be loaded into a register
13762 using DUP, generate instructions to do so and return an RTX to
13763 assign to the register. Otherwise return NULL_RTX. */
13764 static rtx
13765 aarch64_simd_dup_constant (rtx vals)
13766 {
13767 machine_mode mode = GET_MODE (vals);
13768 machine_mode inner_mode = GET_MODE_INNER (mode);
13769 rtx x;
13770
13771 if (!const_vec_duplicate_p (vals, &x))
13772 return NULL_RTX;
13773
13774 /* We can load this constant by using DUP and a constant in a
13775 single ARM register. This will be cheaper than a vector
13776 load. */
13777 x = copy_to_mode_reg (inner_mode, x);
13778 return gen_vec_duplicate (mode, x);
13779 }
13780
13781
13782 /* Generate code to load VALS, which is a PARALLEL containing only
13783 constants (for vec_init) or CONST_VECTOR, efficiently into a
13784 register. Returns an RTX to copy into the register, or NULL_RTX
13785 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13786 static rtx
13787 aarch64_simd_make_constant (rtx vals)
13788 {
13789 machine_mode mode = GET_MODE (vals);
13790 rtx const_dup;
13791 rtx const_vec = NULL_RTX;
13792 int n_const = 0;
13793 int i;
13794
13795 if (GET_CODE (vals) == CONST_VECTOR)
13796 const_vec = vals;
13797 else if (GET_CODE (vals) == PARALLEL)
13798 {
13799 /* A CONST_VECTOR must contain only CONST_INTs and
13800 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13801 Only store valid constants in a CONST_VECTOR. */
13802 int n_elts = XVECLEN (vals, 0);
13803 for (i = 0; i < n_elts; ++i)
13804 {
13805 rtx x = XVECEXP (vals, 0, i);
13806 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13807 n_const++;
13808 }
13809 if (n_const == n_elts)
13810 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13811 }
13812 else
13813 gcc_unreachable ();
13814
13815 if (const_vec != NULL_RTX
13816 && aarch64_simd_valid_immediate (const_vec, NULL))
13817 /* Load using MOVI/MVNI. */
13818 return const_vec;
13819 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13820 /* Loaded using DUP. */
13821 return const_dup;
13822 else if (const_vec != NULL_RTX)
13823 /* Load from constant pool. We can not take advantage of single-cycle
13824 LD1 because we need a PC-relative addressing mode. */
13825 return const_vec;
13826 else
13827 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13828 We can not construct an initializer. */
13829 return NULL_RTX;
13830 }
13831
13832 /* Expand a vector initialisation sequence, such that TARGET is
13833 initialised to contain VALS. */
13834
13835 void
13836 aarch64_expand_vector_init (rtx target, rtx vals)
13837 {
13838 machine_mode mode = GET_MODE (target);
13839 scalar_mode inner_mode = GET_MODE_INNER (mode);
13840 /* The number of vector elements. */
13841 int n_elts = XVECLEN (vals, 0);
13842 /* The number of vector elements which are not constant. */
13843 int n_var = 0;
13844 rtx any_const = NULL_RTX;
13845 /* The first element of vals. */
13846 rtx v0 = XVECEXP (vals, 0, 0);
13847 bool all_same = true;
13848
13849 /* Count the number of variable elements to initialise. */
13850 for (int i = 0; i < n_elts; ++i)
13851 {
13852 rtx x = XVECEXP (vals, 0, i);
13853 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13854 ++n_var;
13855 else
13856 any_const = x;
13857
13858 all_same &= rtx_equal_p (x, v0);
13859 }
13860
13861 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13862 how best to handle this. */
13863 if (n_var == 0)
13864 {
13865 rtx constant = aarch64_simd_make_constant (vals);
13866 if (constant != NULL_RTX)
13867 {
13868 emit_move_insn (target, constant);
13869 return;
13870 }
13871 }
13872
13873 /* Splat a single non-constant element if we can. */
13874 if (all_same)
13875 {
13876 rtx x = copy_to_mode_reg (inner_mode, v0);
13877 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13878 return;
13879 }
13880
13881 enum insn_code icode = optab_handler (vec_set_optab, mode);
13882 gcc_assert (icode != CODE_FOR_nothing);
13883
13884 /* If there are only variable elements, try to optimize
13885 the insertion using dup for the most common element
13886 followed by insertions. */
13887
13888 /* The algorithm will fill matches[*][0] with the earliest matching element,
13889 and matches[X][1] with the count of duplicate elements (if X is the
13890 earliest element which has duplicates). */
13891
13892 if (n_var == n_elts && n_elts <= 16)
13893 {
13894 int matches[16][2] = {0};
13895 for (int i = 0; i < n_elts; i++)
13896 {
13897 for (int j = 0; j <= i; j++)
13898 {
13899 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13900 {
13901 matches[i][0] = j;
13902 matches[j][1]++;
13903 break;
13904 }
13905 }
13906 }
13907 int maxelement = 0;
13908 int maxv = 0;
13909 for (int i = 0; i < n_elts; i++)
13910 if (matches[i][1] > maxv)
13911 {
13912 maxelement = i;
13913 maxv = matches[i][1];
13914 }
13915
13916 /* Create a duplicate of the most common element, unless all elements
13917 are equally useless to us, in which case just immediately set the
13918 vector register using the first element. */
13919
13920 if (maxv == 1)
13921 {
13922 /* For vectors of two 64-bit elements, we can do even better. */
13923 if (n_elts == 2
13924 && (inner_mode == E_DImode
13925 || inner_mode == E_DFmode))
13926
13927 {
13928 rtx x0 = XVECEXP (vals, 0, 0);
13929 rtx x1 = XVECEXP (vals, 0, 1);
13930 /* Combine can pick up this case, but handling it directly
13931 here leaves clearer RTL.
13932
13933 This is load_pair_lanes<mode>, and also gives us a clean-up
13934 for store_pair_lanes<mode>. */
13935 if (memory_operand (x0, inner_mode)
13936 && memory_operand (x1, inner_mode)
13937 && !STRICT_ALIGNMENT
13938 && rtx_equal_p (XEXP (x1, 0),
13939 plus_constant (Pmode,
13940 XEXP (x0, 0),
13941 GET_MODE_SIZE (inner_mode))))
13942 {
13943 rtx t;
13944 if (inner_mode == DFmode)
13945 t = gen_load_pair_lanesdf (target, x0, x1);
13946 else
13947 t = gen_load_pair_lanesdi (target, x0, x1);
13948 emit_insn (t);
13949 return;
13950 }
13951 }
13952 /* The subreg-move sequence below will move into lane zero of the
13953 vector register. For big-endian we want that position to hold
13954 the last element of VALS. */
13955 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
13956 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13957 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
13958 }
13959 else
13960 {
13961 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13962 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13963 }
13964
13965 /* Insert the rest. */
13966 for (int i = 0; i < n_elts; i++)
13967 {
13968 rtx x = XVECEXP (vals, 0, i);
13969 if (matches[i][0] == maxelement)
13970 continue;
13971 x = copy_to_mode_reg (inner_mode, x);
13972 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13973 }
13974 return;
13975 }
13976
13977 /* Initialise a vector which is part-variable. We want to first try
13978 to build those lanes which are constant in the most efficient way we
13979 can. */
13980 if (n_var != n_elts)
13981 {
13982 rtx copy = copy_rtx (vals);
13983
13984 /* Load constant part of vector. We really don't care what goes into the
13985 parts we will overwrite, but we're more likely to be able to load the
13986 constant efficiently if it has fewer, larger, repeating parts
13987 (see aarch64_simd_valid_immediate). */
13988 for (int i = 0; i < n_elts; i++)
13989 {
13990 rtx x = XVECEXP (vals, 0, i);
13991 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13992 continue;
13993 rtx subst = any_const;
13994 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13995 {
13996 /* Look in the copied vector, as more elements are const. */
13997 rtx test = XVECEXP (copy, 0, i ^ bit);
13998 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13999 {
14000 subst = test;
14001 break;
14002 }
14003 }
14004 XVECEXP (copy, 0, i) = subst;
14005 }
14006 aarch64_expand_vector_init (target, copy);
14007 }
14008
14009 /* Insert the variable lanes directly. */
14010 for (int i = 0; i < n_elts; i++)
14011 {
14012 rtx x = XVECEXP (vals, 0, i);
14013 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14014 continue;
14015 x = copy_to_mode_reg (inner_mode, x);
14016 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14017 }
14018 }
14019
14020 static unsigned HOST_WIDE_INT
14021 aarch64_shift_truncation_mask (machine_mode mode)
14022 {
14023 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14024 return 0;
14025 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14026 }
14027
14028 /* Select a format to encode pointers in exception handling data. */
14029 int
14030 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14031 {
14032 int type;
14033 switch (aarch64_cmodel)
14034 {
14035 case AARCH64_CMODEL_TINY:
14036 case AARCH64_CMODEL_TINY_PIC:
14037 case AARCH64_CMODEL_SMALL:
14038 case AARCH64_CMODEL_SMALL_PIC:
14039 case AARCH64_CMODEL_SMALL_SPIC:
14040 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14041 for everything. */
14042 type = DW_EH_PE_sdata4;
14043 break;
14044 default:
14045 /* No assumptions here. 8-byte relocs required. */
14046 type = DW_EH_PE_sdata8;
14047 break;
14048 }
14049 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14050 }
14051
14052 /* The last .arch and .tune assembly strings that we printed. */
14053 static std::string aarch64_last_printed_arch_string;
14054 static std::string aarch64_last_printed_tune_string;
14055
14056 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14057 by the function fndecl. */
14058
14059 void
14060 aarch64_declare_function_name (FILE *stream, const char* name,
14061 tree fndecl)
14062 {
14063 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14064
14065 struct cl_target_option *targ_options;
14066 if (target_parts)
14067 targ_options = TREE_TARGET_OPTION (target_parts);
14068 else
14069 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14070 gcc_assert (targ_options);
14071
14072 const struct processor *this_arch
14073 = aarch64_get_arch (targ_options->x_explicit_arch);
14074
14075 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14076 std::string extension
14077 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14078 this_arch->flags);
14079 /* Only update the assembler .arch string if it is distinct from the last
14080 such string we printed. */
14081 std::string to_print = this_arch->name + extension;
14082 if (to_print != aarch64_last_printed_arch_string)
14083 {
14084 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14085 aarch64_last_printed_arch_string = to_print;
14086 }
14087
14088 /* Print the cpu name we're tuning for in the comments, might be
14089 useful to readers of the generated asm. Do it only when it changes
14090 from function to function and verbose assembly is requested. */
14091 const struct processor *this_tune
14092 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14093
14094 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14095 {
14096 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14097 this_tune->name);
14098 aarch64_last_printed_tune_string = this_tune->name;
14099 }
14100
14101 /* Don't forget the type directive for ELF. */
14102 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14103 ASM_OUTPUT_LABEL (stream, name);
14104 }
14105
14106 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14107
14108 static void
14109 aarch64_start_file (void)
14110 {
14111 struct cl_target_option *default_options
14112 = TREE_TARGET_OPTION (target_option_default_node);
14113
14114 const struct processor *default_arch
14115 = aarch64_get_arch (default_options->x_explicit_arch);
14116 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14117 std::string extension
14118 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14119 default_arch->flags);
14120
14121 aarch64_last_printed_arch_string = default_arch->name + extension;
14122 aarch64_last_printed_tune_string = "";
14123 asm_fprintf (asm_out_file, "\t.arch %s\n",
14124 aarch64_last_printed_arch_string.c_str ());
14125
14126 default_file_start ();
14127 }
14128
14129 /* Emit load exclusive. */
14130
14131 static void
14132 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14133 rtx mem, rtx model_rtx)
14134 {
14135 rtx (*gen) (rtx, rtx, rtx);
14136
14137 switch (mode)
14138 {
14139 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14140 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14141 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14142 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14143 default:
14144 gcc_unreachable ();
14145 }
14146
14147 emit_insn (gen (rval, mem, model_rtx));
14148 }
14149
14150 /* Emit store exclusive. */
14151
14152 static void
14153 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14154 rtx rval, rtx mem, rtx model_rtx)
14155 {
14156 rtx (*gen) (rtx, rtx, rtx, rtx);
14157
14158 switch (mode)
14159 {
14160 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14161 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14162 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14163 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14164 default:
14165 gcc_unreachable ();
14166 }
14167
14168 emit_insn (gen (bval, rval, mem, model_rtx));
14169 }
14170
14171 /* Mark the previous jump instruction as unlikely. */
14172
14173 static void
14174 aarch64_emit_unlikely_jump (rtx insn)
14175 {
14176 rtx_insn *jump = emit_jump_insn (insn);
14177 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14178 }
14179
14180 /* Expand a compare and swap pattern. */
14181
14182 void
14183 aarch64_expand_compare_and_swap (rtx operands[])
14184 {
14185 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14186 machine_mode mode, cmp_mode;
14187 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14188 int idx;
14189 gen_cas_fn gen;
14190 const gen_cas_fn split_cas[] =
14191 {
14192 gen_aarch64_compare_and_swapqi,
14193 gen_aarch64_compare_and_swaphi,
14194 gen_aarch64_compare_and_swapsi,
14195 gen_aarch64_compare_and_swapdi
14196 };
14197 const gen_cas_fn atomic_cas[] =
14198 {
14199 gen_aarch64_compare_and_swapqi_lse,
14200 gen_aarch64_compare_and_swaphi_lse,
14201 gen_aarch64_compare_and_swapsi_lse,
14202 gen_aarch64_compare_and_swapdi_lse
14203 };
14204
14205 bval = operands[0];
14206 rval = operands[1];
14207 mem = operands[2];
14208 oldval = operands[3];
14209 newval = operands[4];
14210 is_weak = operands[5];
14211 mod_s = operands[6];
14212 mod_f = operands[7];
14213 mode = GET_MODE (mem);
14214 cmp_mode = mode;
14215
14216 /* Normally the succ memory model must be stronger than fail, but in the
14217 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14218 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14219
14220 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14221 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14222 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14223
14224 switch (mode)
14225 {
14226 case E_QImode:
14227 case E_HImode:
14228 /* For short modes, we're going to perform the comparison in SImode,
14229 so do the zero-extension now. */
14230 cmp_mode = SImode;
14231 rval = gen_reg_rtx (SImode);
14232 oldval = convert_modes (SImode, mode, oldval, true);
14233 /* Fall through. */
14234
14235 case E_SImode:
14236 case E_DImode:
14237 /* Force the value into a register if needed. */
14238 if (!aarch64_plus_operand (oldval, mode))
14239 oldval = force_reg (cmp_mode, oldval);
14240 break;
14241
14242 default:
14243 gcc_unreachable ();
14244 }
14245
14246 switch (mode)
14247 {
14248 case E_QImode: idx = 0; break;
14249 case E_HImode: idx = 1; break;
14250 case E_SImode: idx = 2; break;
14251 case E_DImode: idx = 3; break;
14252 default:
14253 gcc_unreachable ();
14254 }
14255 if (TARGET_LSE)
14256 gen = atomic_cas[idx];
14257 else
14258 gen = split_cas[idx];
14259
14260 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14261
14262 if (mode == QImode || mode == HImode)
14263 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14264
14265 x = gen_rtx_REG (CCmode, CC_REGNUM);
14266 x = gen_rtx_EQ (SImode, x, const0_rtx);
14267 emit_insn (gen_rtx_SET (bval, x));
14268 }
14269
14270 /* Test whether the target supports using a atomic load-operate instruction.
14271 CODE is the operation and AFTER is TRUE if the data in memory after the
14272 operation should be returned and FALSE if the data before the operation
14273 should be returned. Returns FALSE if the operation isn't supported by the
14274 architecture. */
14275
14276 bool
14277 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14278 {
14279 if (!TARGET_LSE)
14280 return false;
14281
14282 switch (code)
14283 {
14284 case SET:
14285 case AND:
14286 case IOR:
14287 case XOR:
14288 case MINUS:
14289 case PLUS:
14290 return true;
14291 default:
14292 return false;
14293 }
14294 }
14295
14296 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14297 sequence implementing an atomic operation. */
14298
14299 static void
14300 aarch64_emit_post_barrier (enum memmodel model)
14301 {
14302 const enum memmodel base_model = memmodel_base (model);
14303
14304 if (is_mm_sync (model)
14305 && (base_model == MEMMODEL_ACQUIRE
14306 || base_model == MEMMODEL_ACQ_REL
14307 || base_model == MEMMODEL_SEQ_CST))
14308 {
14309 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14310 }
14311 }
14312
14313 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14314 for the data in memory. EXPECTED is the value expected to be in memory.
14315 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14316 is the memory ordering to use. */
14317
14318 void
14319 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14320 rtx expected, rtx desired,
14321 rtx model)
14322 {
14323 rtx (*gen) (rtx, rtx, rtx, rtx);
14324 machine_mode mode;
14325
14326 mode = GET_MODE (mem);
14327
14328 switch (mode)
14329 {
14330 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14331 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14332 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14333 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14334 default:
14335 gcc_unreachable ();
14336 }
14337
14338 /* Move the expected value into the CAS destination register. */
14339 emit_insn (gen_rtx_SET (rval, expected));
14340
14341 /* Emit the CAS. */
14342 emit_insn (gen (rval, mem, desired, model));
14343
14344 /* Compare the expected value with the value loaded by the CAS, to establish
14345 whether the swap was made. */
14346 aarch64_gen_compare_reg (EQ, rval, expected);
14347 }
14348
14349 /* Split a compare and swap pattern. */
14350
14351 void
14352 aarch64_split_compare_and_swap (rtx operands[])
14353 {
14354 rtx rval, mem, oldval, newval, scratch;
14355 machine_mode mode;
14356 bool is_weak;
14357 rtx_code_label *label1, *label2;
14358 rtx x, cond;
14359 enum memmodel model;
14360 rtx model_rtx;
14361
14362 rval = operands[0];
14363 mem = operands[1];
14364 oldval = operands[2];
14365 newval = operands[3];
14366 is_weak = (operands[4] != const0_rtx);
14367 model_rtx = operands[5];
14368 scratch = operands[7];
14369 mode = GET_MODE (mem);
14370 model = memmodel_from_int (INTVAL (model_rtx));
14371
14372 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14373 loop:
14374 .label1:
14375 LD[A]XR rval, [mem]
14376 CBNZ rval, .label2
14377 ST[L]XR scratch, newval, [mem]
14378 CBNZ scratch, .label1
14379 .label2:
14380 CMP rval, 0. */
14381 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14382
14383 label1 = NULL;
14384 if (!is_weak)
14385 {
14386 label1 = gen_label_rtx ();
14387 emit_label (label1);
14388 }
14389 label2 = gen_label_rtx ();
14390
14391 /* The initial load can be relaxed for a __sync operation since a final
14392 barrier will be emitted to stop code hoisting. */
14393 if (is_mm_sync (model))
14394 aarch64_emit_load_exclusive (mode, rval, mem,
14395 GEN_INT (MEMMODEL_RELAXED));
14396 else
14397 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14398
14399 if (strong_zero_p)
14400 {
14401 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14402 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14403 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14404 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14405 }
14406 else
14407 {
14408 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14409 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14410 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14411 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14412 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14413 }
14414
14415 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14416
14417 if (!is_weak)
14418 {
14419 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14420 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14421 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14422 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14423 }
14424 else
14425 {
14426 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14427 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14428 emit_insn (gen_rtx_SET (cond, x));
14429 }
14430
14431 emit_label (label2);
14432 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14433 to set the condition flags. If this is not used it will be removed by
14434 later passes. */
14435 if (strong_zero_p)
14436 {
14437 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14438 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14439 emit_insn (gen_rtx_SET (cond, x));
14440 }
14441 /* Emit any final barrier needed for a __sync operation. */
14442 if (is_mm_sync (model))
14443 aarch64_emit_post_barrier (model);
14444 }
14445
14446 /* Emit a BIC instruction. */
14447
14448 static void
14449 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14450 {
14451 rtx shift_rtx = GEN_INT (shift);
14452 rtx (*gen) (rtx, rtx, rtx, rtx);
14453
14454 switch (mode)
14455 {
14456 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14457 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14458 default:
14459 gcc_unreachable ();
14460 }
14461
14462 emit_insn (gen (dst, s2, shift_rtx, s1));
14463 }
14464
14465 /* Emit an atomic swap. */
14466
14467 static void
14468 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14469 rtx mem, rtx model)
14470 {
14471 rtx (*gen) (rtx, rtx, rtx, rtx);
14472
14473 switch (mode)
14474 {
14475 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14476 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14477 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14478 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14479 default:
14480 gcc_unreachable ();
14481 }
14482
14483 emit_insn (gen (dst, mem, value, model));
14484 }
14485
14486 /* Operations supported by aarch64_emit_atomic_load_op. */
14487
14488 enum aarch64_atomic_load_op_code
14489 {
14490 AARCH64_LDOP_PLUS, /* A + B */
14491 AARCH64_LDOP_XOR, /* A ^ B */
14492 AARCH64_LDOP_OR, /* A | B */
14493 AARCH64_LDOP_BIC /* A & ~B */
14494 };
14495
14496 /* Emit an atomic load-operate. */
14497
14498 static void
14499 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14500 machine_mode mode, rtx dst, rtx src,
14501 rtx mem, rtx model)
14502 {
14503 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14504 const aarch64_atomic_load_op_fn plus[] =
14505 {
14506 gen_aarch64_atomic_loadaddqi,
14507 gen_aarch64_atomic_loadaddhi,
14508 gen_aarch64_atomic_loadaddsi,
14509 gen_aarch64_atomic_loadadddi
14510 };
14511 const aarch64_atomic_load_op_fn eor[] =
14512 {
14513 gen_aarch64_atomic_loadeorqi,
14514 gen_aarch64_atomic_loadeorhi,
14515 gen_aarch64_atomic_loadeorsi,
14516 gen_aarch64_atomic_loadeordi
14517 };
14518 const aarch64_atomic_load_op_fn ior[] =
14519 {
14520 gen_aarch64_atomic_loadsetqi,
14521 gen_aarch64_atomic_loadsethi,
14522 gen_aarch64_atomic_loadsetsi,
14523 gen_aarch64_atomic_loadsetdi
14524 };
14525 const aarch64_atomic_load_op_fn bic[] =
14526 {
14527 gen_aarch64_atomic_loadclrqi,
14528 gen_aarch64_atomic_loadclrhi,
14529 gen_aarch64_atomic_loadclrsi,
14530 gen_aarch64_atomic_loadclrdi
14531 };
14532 aarch64_atomic_load_op_fn gen;
14533 int idx = 0;
14534
14535 switch (mode)
14536 {
14537 case E_QImode: idx = 0; break;
14538 case E_HImode: idx = 1; break;
14539 case E_SImode: idx = 2; break;
14540 case E_DImode: idx = 3; break;
14541 default:
14542 gcc_unreachable ();
14543 }
14544
14545 switch (code)
14546 {
14547 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14548 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14549 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14550 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14551 default:
14552 gcc_unreachable ();
14553 }
14554
14555 emit_insn (gen (dst, mem, src, model));
14556 }
14557
14558 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14559 location to store the data read from memory. OUT_RESULT is the location to
14560 store the result of the operation. MEM is the memory location to read and
14561 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14562 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14563 be NULL. */
14564
14565 void
14566 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14567 rtx mem, rtx value, rtx model_rtx)
14568 {
14569 machine_mode mode = GET_MODE (mem);
14570 machine_mode wmode = (mode == DImode ? DImode : SImode);
14571 const bool short_mode = (mode < SImode);
14572 aarch64_atomic_load_op_code ldop_code;
14573 rtx src;
14574 rtx x;
14575
14576 if (out_data)
14577 out_data = gen_lowpart (mode, out_data);
14578
14579 if (out_result)
14580 out_result = gen_lowpart (mode, out_result);
14581
14582 /* Make sure the value is in a register, putting it into a destination
14583 register if it needs to be manipulated. */
14584 if (!register_operand (value, mode)
14585 || code == AND || code == MINUS)
14586 {
14587 src = out_result ? out_result : out_data;
14588 emit_move_insn (src, gen_lowpart (mode, value));
14589 }
14590 else
14591 src = value;
14592 gcc_assert (register_operand (src, mode));
14593
14594 /* Preprocess the data for the operation as necessary. If the operation is
14595 a SET then emit a swap instruction and finish. */
14596 switch (code)
14597 {
14598 case SET:
14599 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14600 return;
14601
14602 case MINUS:
14603 /* Negate the value and treat it as a PLUS. */
14604 {
14605 rtx neg_src;
14606
14607 /* Resize the value if necessary. */
14608 if (short_mode)
14609 src = gen_lowpart (wmode, src);
14610
14611 neg_src = gen_rtx_NEG (wmode, src);
14612 emit_insn (gen_rtx_SET (src, neg_src));
14613
14614 if (short_mode)
14615 src = gen_lowpart (mode, src);
14616 }
14617 /* Fall-through. */
14618 case PLUS:
14619 ldop_code = AARCH64_LDOP_PLUS;
14620 break;
14621
14622 case IOR:
14623 ldop_code = AARCH64_LDOP_OR;
14624 break;
14625
14626 case XOR:
14627 ldop_code = AARCH64_LDOP_XOR;
14628 break;
14629
14630 case AND:
14631 {
14632 rtx not_src;
14633
14634 /* Resize the value if necessary. */
14635 if (short_mode)
14636 src = gen_lowpart (wmode, src);
14637
14638 not_src = gen_rtx_NOT (wmode, src);
14639 emit_insn (gen_rtx_SET (src, not_src));
14640
14641 if (short_mode)
14642 src = gen_lowpart (mode, src);
14643 }
14644 ldop_code = AARCH64_LDOP_BIC;
14645 break;
14646
14647 default:
14648 /* The operation can't be done with atomic instructions. */
14649 gcc_unreachable ();
14650 }
14651
14652 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14653
14654 /* If necessary, calculate the data in memory after the update by redoing the
14655 operation from values in registers. */
14656 if (!out_result)
14657 return;
14658
14659 if (short_mode)
14660 {
14661 src = gen_lowpart (wmode, src);
14662 out_data = gen_lowpart (wmode, out_data);
14663 out_result = gen_lowpart (wmode, out_result);
14664 }
14665
14666 x = NULL_RTX;
14667
14668 switch (code)
14669 {
14670 case MINUS:
14671 case PLUS:
14672 x = gen_rtx_PLUS (wmode, out_data, src);
14673 break;
14674 case IOR:
14675 x = gen_rtx_IOR (wmode, out_data, src);
14676 break;
14677 case XOR:
14678 x = gen_rtx_XOR (wmode, out_data, src);
14679 break;
14680 case AND:
14681 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14682 return;
14683 default:
14684 gcc_unreachable ();
14685 }
14686
14687 emit_set_insn (out_result, x);
14688
14689 return;
14690 }
14691
14692 /* Split an atomic operation. */
14693
14694 void
14695 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14696 rtx value, rtx model_rtx, rtx cond)
14697 {
14698 machine_mode mode = GET_MODE (mem);
14699 machine_mode wmode = (mode == DImode ? DImode : SImode);
14700 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14701 const bool is_sync = is_mm_sync (model);
14702 rtx_code_label *label;
14703 rtx x;
14704
14705 /* Split the atomic operation into a sequence. */
14706 label = gen_label_rtx ();
14707 emit_label (label);
14708
14709 if (new_out)
14710 new_out = gen_lowpart (wmode, new_out);
14711 if (old_out)
14712 old_out = gen_lowpart (wmode, old_out);
14713 else
14714 old_out = new_out;
14715 value = simplify_gen_subreg (wmode, value, mode, 0);
14716
14717 /* The initial load can be relaxed for a __sync operation since a final
14718 barrier will be emitted to stop code hoisting. */
14719 if (is_sync)
14720 aarch64_emit_load_exclusive (mode, old_out, mem,
14721 GEN_INT (MEMMODEL_RELAXED));
14722 else
14723 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14724
14725 switch (code)
14726 {
14727 case SET:
14728 new_out = value;
14729 break;
14730
14731 case NOT:
14732 x = gen_rtx_AND (wmode, old_out, value);
14733 emit_insn (gen_rtx_SET (new_out, x));
14734 x = gen_rtx_NOT (wmode, new_out);
14735 emit_insn (gen_rtx_SET (new_out, x));
14736 break;
14737
14738 case MINUS:
14739 if (CONST_INT_P (value))
14740 {
14741 value = GEN_INT (-INTVAL (value));
14742 code = PLUS;
14743 }
14744 /* Fall through. */
14745
14746 default:
14747 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14748 emit_insn (gen_rtx_SET (new_out, x));
14749 break;
14750 }
14751
14752 aarch64_emit_store_exclusive (mode, cond, mem,
14753 gen_lowpart (mode, new_out), model_rtx);
14754
14755 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14756 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14757 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14758 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14759
14760 /* Emit any final barrier needed for a __sync operation. */
14761 if (is_sync)
14762 aarch64_emit_post_barrier (model);
14763 }
14764
14765 static void
14766 aarch64_init_libfuncs (void)
14767 {
14768 /* Half-precision float operations. The compiler handles all operations
14769 with NULL libfuncs by converting to SFmode. */
14770
14771 /* Conversions. */
14772 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14773 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14774
14775 /* Arithmetic. */
14776 set_optab_libfunc (add_optab, HFmode, NULL);
14777 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14778 set_optab_libfunc (smul_optab, HFmode, NULL);
14779 set_optab_libfunc (neg_optab, HFmode, NULL);
14780 set_optab_libfunc (sub_optab, HFmode, NULL);
14781
14782 /* Comparisons. */
14783 set_optab_libfunc (eq_optab, HFmode, NULL);
14784 set_optab_libfunc (ne_optab, HFmode, NULL);
14785 set_optab_libfunc (lt_optab, HFmode, NULL);
14786 set_optab_libfunc (le_optab, HFmode, NULL);
14787 set_optab_libfunc (ge_optab, HFmode, NULL);
14788 set_optab_libfunc (gt_optab, HFmode, NULL);
14789 set_optab_libfunc (unord_optab, HFmode, NULL);
14790 }
14791
14792 /* Target hook for c_mode_for_suffix. */
14793 static machine_mode
14794 aarch64_c_mode_for_suffix (char suffix)
14795 {
14796 if (suffix == 'q')
14797 return TFmode;
14798
14799 return VOIDmode;
14800 }
14801
14802 /* We can only represent floating point constants which will fit in
14803 "quarter-precision" values. These values are characterised by
14804 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
14805 by:
14806
14807 (-1)^s * (n/16) * 2^r
14808
14809 Where:
14810 's' is the sign bit.
14811 'n' is an integer in the range 16 <= n <= 31.
14812 'r' is an integer in the range -3 <= r <= 4. */
14813
14814 /* Return true iff X can be represented by a quarter-precision
14815 floating point immediate operand X. Note, we cannot represent 0.0. */
14816 bool
14817 aarch64_float_const_representable_p (rtx x)
14818 {
14819 /* This represents our current view of how many bits
14820 make up the mantissa. */
14821 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14822 int exponent;
14823 unsigned HOST_WIDE_INT mantissa, mask;
14824 REAL_VALUE_TYPE r, m;
14825 bool fail;
14826
14827 if (!CONST_DOUBLE_P (x))
14828 return false;
14829
14830 /* We don't support HFmode constants yet. */
14831 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14832 return false;
14833
14834 r = *CONST_DOUBLE_REAL_VALUE (x);
14835
14836 /* We cannot represent infinities, NaNs or +/-zero. We won't
14837 know if we have +zero until we analyse the mantissa, but we
14838 can reject the other invalid values. */
14839 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14840 || REAL_VALUE_MINUS_ZERO (r))
14841 return false;
14842
14843 /* Extract exponent. */
14844 r = real_value_abs (&r);
14845 exponent = REAL_EXP (&r);
14846
14847 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14848 highest (sign) bit, with a fixed binary point at bit point_pos.
14849 m1 holds the low part of the mantissa, m2 the high part.
14850 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14851 bits for the mantissa, this can fail (low bits will be lost). */
14852 real_ldexp (&m, &r, point_pos - exponent);
14853 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14854
14855 /* If the low part of the mantissa has bits set we cannot represent
14856 the value. */
14857 if (w.ulow () != 0)
14858 return false;
14859 /* We have rejected the lower HOST_WIDE_INT, so update our
14860 understanding of how many bits lie in the mantissa and
14861 look only at the high HOST_WIDE_INT. */
14862 mantissa = w.elt (1);
14863 point_pos -= HOST_BITS_PER_WIDE_INT;
14864
14865 /* We can only represent values with a mantissa of the form 1.xxxx. */
14866 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14867 if ((mantissa & mask) != 0)
14868 return false;
14869
14870 /* Having filtered unrepresentable values, we may now remove all
14871 but the highest 5 bits. */
14872 mantissa >>= point_pos - 5;
14873
14874 /* We cannot represent the value 0.0, so reject it. This is handled
14875 elsewhere. */
14876 if (mantissa == 0)
14877 return false;
14878
14879 /* Then, as bit 4 is always set, we can mask it off, leaving
14880 the mantissa in the range [0, 15]. */
14881 mantissa &= ~(1 << 4);
14882 gcc_assert (mantissa <= 15);
14883
14884 /* GCC internally does not use IEEE754-like encoding (where normalized
14885 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14886 Our mantissa values are shifted 4 places to the left relative to
14887 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14888 by 5 places to correct for GCC's representation. */
14889 exponent = 5 - exponent;
14890
14891 return (exponent >= 0 && exponent <= 7);
14892 }
14893
14894 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14895 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14896 output MOVI/MVNI, ORR or BIC immediate. */
14897 char*
14898 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14899 enum simd_immediate_check which)
14900 {
14901 bool is_valid;
14902 static char templ[40];
14903 const char *mnemonic;
14904 const char *shift_op;
14905 unsigned int lane_count = 0;
14906 char element_char;
14907
14908 struct simd_immediate_info info;
14909
14910 /* This will return true to show const_vector is legal for use as either
14911 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14912 It will also update INFO to show how the immediate should be generated.
14913 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14914 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14915 gcc_assert (is_valid);
14916
14917 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14918 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14919
14920 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14921 {
14922 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14923 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14924 move immediate path. */
14925 if (aarch64_float_const_zero_rtx_p (info.value))
14926 info.value = GEN_INT (0);
14927 else
14928 {
14929 const unsigned int buf_size = 20;
14930 char float_buf[buf_size] = {'\0'};
14931 real_to_decimal_for_mode (float_buf,
14932 CONST_DOUBLE_REAL_VALUE (info.value),
14933 buf_size, buf_size, 1, info.elt_mode);
14934
14935 if (lane_count == 1)
14936 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14937 else
14938 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14939 lane_count, element_char, float_buf);
14940 return templ;
14941 }
14942 }
14943
14944 gcc_assert (CONST_INT_P (info.value));
14945
14946 if (which == AARCH64_CHECK_MOV)
14947 {
14948 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14949 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14950 if (lane_count == 1)
14951 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14952 mnemonic, UINTVAL (info.value));
14953 else if (info.shift)
14954 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14955 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14956 element_char, UINTVAL (info.value), shift_op, info.shift);
14957 else
14958 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14959 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14960 element_char, UINTVAL (info.value));
14961 }
14962 else
14963 {
14964 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14965 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14966 if (info.shift)
14967 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14968 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14969 element_char, UINTVAL (info.value), "lsl", info.shift);
14970 else
14971 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14972 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14973 element_char, UINTVAL (info.value));
14974 }
14975 return templ;
14976 }
14977
14978 char*
14979 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14980 {
14981
14982 /* If a floating point number was passed and we desire to use it in an
14983 integer mode do the conversion to integer. */
14984 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14985 {
14986 unsigned HOST_WIDE_INT ival;
14987 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14988 gcc_unreachable ();
14989 immediate = gen_int_mode (ival, mode);
14990 }
14991
14992 machine_mode vmode;
14993 /* use a 64 bit mode for everything except for DI/DF mode, where we use
14994 a 128 bit vector mode. */
14995 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14996
14997 vmode = aarch64_simd_container_mode (mode, width);
14998 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14999 return aarch64_output_simd_mov_immediate (v_op, width);
15000 }
15001
15002 /* Return the output string to use for moving immediate CONST_VECTOR
15003 into an SVE register. */
15004
15005 char *
15006 aarch64_output_sve_mov_immediate (rtx const_vector)
15007 {
15008 static char templ[40];
15009 struct simd_immediate_info info;
15010 char element_char;
15011
15012 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15013 gcc_assert (is_valid);
15014
15015 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15016
15017 if (info.step)
15018 {
15019 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15020 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15021 element_char, INTVAL (info.value), INTVAL (info.step));
15022 return templ;
15023 }
15024
15025 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15026 {
15027 if (aarch64_float_const_zero_rtx_p (info.value))
15028 info.value = GEN_INT (0);
15029 else
15030 {
15031 const int buf_size = 20;
15032 char float_buf[buf_size] = {};
15033 real_to_decimal_for_mode (float_buf,
15034 CONST_DOUBLE_REAL_VALUE (info.value),
15035 buf_size, buf_size, 1, info.elt_mode);
15036
15037 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15038 element_char, float_buf);
15039 return templ;
15040 }
15041 }
15042
15043 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15044 element_char, INTVAL (info.value));
15045 return templ;
15046 }
15047
15048 /* Return the asm format for a PTRUE instruction whose destination has
15049 mode MODE. SUFFIX is the element size suffix. */
15050
15051 char *
15052 aarch64_output_ptrue (machine_mode mode, char suffix)
15053 {
15054 unsigned int nunits;
15055 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15056 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15057 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15058 else
15059 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15060 return buf;
15061 }
15062
15063 /* Split operands into moves from op[1] + op[2] into op[0]. */
15064
15065 void
15066 aarch64_split_combinev16qi (rtx operands[3])
15067 {
15068 unsigned int dest = REGNO (operands[0]);
15069 unsigned int src1 = REGNO (operands[1]);
15070 unsigned int src2 = REGNO (operands[2]);
15071 machine_mode halfmode = GET_MODE (operands[1]);
15072 unsigned int halfregs = REG_NREGS (operands[1]);
15073 rtx destlo, desthi;
15074
15075 gcc_assert (halfmode == V16QImode);
15076
15077 if (src1 == dest && src2 == dest + halfregs)
15078 {
15079 /* No-op move. Can't split to nothing; emit something. */
15080 emit_note (NOTE_INSN_DELETED);
15081 return;
15082 }
15083
15084 /* Preserve register attributes for variable tracking. */
15085 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15086 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15087 GET_MODE_SIZE (halfmode));
15088
15089 /* Special case of reversed high/low parts. */
15090 if (reg_overlap_mentioned_p (operands[2], destlo)
15091 && reg_overlap_mentioned_p (operands[1], desthi))
15092 {
15093 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15094 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15095 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15096 }
15097 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15098 {
15099 /* Try to avoid unnecessary moves if part of the result
15100 is in the right place already. */
15101 if (src1 != dest)
15102 emit_move_insn (destlo, operands[1]);
15103 if (src2 != dest + halfregs)
15104 emit_move_insn (desthi, operands[2]);
15105 }
15106 else
15107 {
15108 if (src2 != dest + halfregs)
15109 emit_move_insn (desthi, operands[2]);
15110 if (src1 != dest)
15111 emit_move_insn (destlo, operands[1]);
15112 }
15113 }
15114
15115 /* vec_perm support. */
15116
15117 struct expand_vec_perm_d
15118 {
15119 rtx target, op0, op1;
15120 vec_perm_indices perm;
15121 machine_mode vmode;
15122 unsigned int vec_flags;
15123 bool one_vector_p;
15124 bool testing_p;
15125 };
15126
15127 /* Generate a variable permutation. */
15128
15129 static void
15130 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15131 {
15132 machine_mode vmode = GET_MODE (target);
15133 bool one_vector_p = rtx_equal_p (op0, op1);
15134
15135 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15136 gcc_checking_assert (GET_MODE (op0) == vmode);
15137 gcc_checking_assert (GET_MODE (op1) == vmode);
15138 gcc_checking_assert (GET_MODE (sel) == vmode);
15139 gcc_checking_assert (TARGET_SIMD);
15140
15141 if (one_vector_p)
15142 {
15143 if (vmode == V8QImode)
15144 {
15145 /* Expand the argument to a V16QI mode by duplicating it. */
15146 rtx pair = gen_reg_rtx (V16QImode);
15147 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15148 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15149 }
15150 else
15151 {
15152 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15153 }
15154 }
15155 else
15156 {
15157 rtx pair;
15158
15159 if (vmode == V8QImode)
15160 {
15161 pair = gen_reg_rtx (V16QImode);
15162 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15163 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15164 }
15165 else
15166 {
15167 pair = gen_reg_rtx (OImode);
15168 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15169 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15170 }
15171 }
15172 }
15173
15174 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15175 NELT is the number of elements in the vector. */
15176
15177 void
15178 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15179 unsigned int nelt)
15180 {
15181 machine_mode vmode = GET_MODE (target);
15182 bool one_vector_p = rtx_equal_p (op0, op1);
15183 rtx mask;
15184
15185 /* The TBL instruction does not use a modulo index, so we must take care
15186 of that ourselves. */
15187 mask = aarch64_simd_gen_const_vector_dup (vmode,
15188 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15189 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15190
15191 /* For big-endian, we also need to reverse the index within the vector
15192 (but not which vector). */
15193 if (BYTES_BIG_ENDIAN)
15194 {
15195 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15196 if (!one_vector_p)
15197 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15198 sel = expand_simple_binop (vmode, XOR, sel, mask,
15199 NULL, 0, OPTAB_LIB_WIDEN);
15200 }
15201 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15202 }
15203
15204 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15205
15206 static void
15207 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15208 {
15209 emit_insn (gen_rtx_SET (target,
15210 gen_rtx_UNSPEC (GET_MODE (target),
15211 gen_rtvec (2, op0, op1), code)));
15212 }
15213
15214 /* Expand an SVE vec_perm with the given operands. */
15215
15216 void
15217 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15218 {
15219 machine_mode data_mode = GET_MODE (target);
15220 machine_mode sel_mode = GET_MODE (sel);
15221 /* Enforced by the pattern condition. */
15222 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15223
15224 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15225 size of the two value vectors, i.e. the upper bits of the indices
15226 are effectively ignored. SVE TBL instead produces 0 for any
15227 out-of-range indices, so we need to modulo all the vec_perm indices
15228 to ensure they are all in range. */
15229 rtx sel_reg = force_reg (sel_mode, sel);
15230
15231 /* Check if the sel only references the first values vector. */
15232 if (GET_CODE (sel) == CONST_VECTOR
15233 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15234 {
15235 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15236 return;
15237 }
15238
15239 /* Check if the two values vectors are the same. */
15240 if (rtx_equal_p (op0, op1))
15241 {
15242 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15243 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15244 NULL, 0, OPTAB_DIRECT);
15245 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15246 return;
15247 }
15248
15249 /* Run TBL on for each value vector and combine the results. */
15250
15251 rtx res0 = gen_reg_rtx (data_mode);
15252 rtx res1 = gen_reg_rtx (data_mode);
15253 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15254 if (GET_CODE (sel) != CONST_VECTOR
15255 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15256 {
15257 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15258 2 * nunits - 1);
15259 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15260 NULL, 0, OPTAB_DIRECT);
15261 }
15262 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15263 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15264 NULL, 0, OPTAB_DIRECT);
15265 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15266 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15267 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15268 else
15269 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15270 }
15271
15272 /* Recognize patterns suitable for the TRN instructions. */
15273 static bool
15274 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15275 {
15276 HOST_WIDE_INT odd;
15277 poly_uint64 nelt = d->perm.length ();
15278 rtx out, in0, in1, x;
15279 machine_mode vmode = d->vmode;
15280
15281 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15282 return false;
15283
15284 /* Note that these are little-endian tests.
15285 We correct for big-endian later. */
15286 if (!d->perm[0].is_constant (&odd)
15287 || (odd != 0 && odd != 1)
15288 || !d->perm.series_p (0, 2, odd, 2)
15289 || !d->perm.series_p (1, 2, nelt + odd, 2))
15290 return false;
15291
15292 /* Success! */
15293 if (d->testing_p)
15294 return true;
15295
15296 in0 = d->op0;
15297 in1 = d->op1;
15298 /* We don't need a big-endian lane correction for SVE; see the comment
15299 at the head of aarch64-sve.md for details. */
15300 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15301 {
15302 x = in0, in0 = in1, in1 = x;
15303 odd = !odd;
15304 }
15305 out = d->target;
15306
15307 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15308 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15309 return true;
15310 }
15311
15312 /* Recognize patterns suitable for the UZP instructions. */
15313 static bool
15314 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15315 {
15316 HOST_WIDE_INT odd;
15317 rtx out, in0, in1, x;
15318 machine_mode vmode = d->vmode;
15319
15320 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15321 return false;
15322
15323 /* Note that these are little-endian tests.
15324 We correct for big-endian later. */
15325 if (!d->perm[0].is_constant (&odd)
15326 || (odd != 0 && odd != 1)
15327 || !d->perm.series_p (0, 1, odd, 2))
15328 return false;
15329
15330 /* Success! */
15331 if (d->testing_p)
15332 return true;
15333
15334 in0 = d->op0;
15335 in1 = d->op1;
15336 /* We don't need a big-endian lane correction for SVE; see the comment
15337 at the head of aarch64-sve.md for details. */
15338 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15339 {
15340 x = in0, in0 = in1, in1 = x;
15341 odd = !odd;
15342 }
15343 out = d->target;
15344
15345 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15346 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15347 return true;
15348 }
15349
15350 /* Recognize patterns suitable for the ZIP instructions. */
15351 static bool
15352 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15353 {
15354 unsigned int high;
15355 poly_uint64 nelt = d->perm.length ();
15356 rtx out, in0, in1, x;
15357 machine_mode vmode = d->vmode;
15358
15359 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15360 return false;
15361
15362 /* Note that these are little-endian tests.
15363 We correct for big-endian later. */
15364 poly_uint64 first = d->perm[0];
15365 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15366 || !d->perm.series_p (0, 2, first, 1)
15367 || !d->perm.series_p (1, 2, first + nelt, 1))
15368 return false;
15369 high = maybe_ne (first, 0U);
15370
15371 /* Success! */
15372 if (d->testing_p)
15373 return true;
15374
15375 in0 = d->op0;
15376 in1 = d->op1;
15377 /* We don't need a big-endian lane correction for SVE; see the comment
15378 at the head of aarch64-sve.md for details. */
15379 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15380 {
15381 x = in0, in0 = in1, in1 = x;
15382 high = !high;
15383 }
15384 out = d->target;
15385
15386 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15387 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15388 return true;
15389 }
15390
15391 /* Recognize patterns for the EXT insn. */
15392
15393 static bool
15394 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15395 {
15396 HOST_WIDE_INT location;
15397 rtx offset;
15398
15399 /* The first element always refers to the first vector.
15400 Check if the extracted indices are increasing by one. */
15401 if (d->vec_flags == VEC_SVE_PRED
15402 || !d->perm[0].is_constant (&location)
15403 || !d->perm.series_p (0, 1, location, 1))
15404 return false;
15405
15406 /* Success! */
15407 if (d->testing_p)
15408 return true;
15409
15410 /* The case where (location == 0) is a no-op for both big- and little-endian,
15411 and is removed by the mid-end at optimization levels -O1 and higher.
15412
15413 We don't need a big-endian lane correction for SVE; see the comment
15414 at the head of aarch64-sve.md for details. */
15415 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15416 {
15417 /* After setup, we want the high elements of the first vector (stored
15418 at the LSB end of the register), and the low elements of the second
15419 vector (stored at the MSB end of the register). So swap. */
15420 std::swap (d->op0, d->op1);
15421 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15422 to_constant () is safe since this is restricted to Advanced SIMD
15423 vectors. */
15424 location = d->perm.length ().to_constant () - location;
15425 }
15426
15427 offset = GEN_INT (location);
15428 emit_set_insn (d->target,
15429 gen_rtx_UNSPEC (d->vmode,
15430 gen_rtvec (3, d->op0, d->op1, offset),
15431 UNSPEC_EXT));
15432 return true;
15433 }
15434
15435 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15436 within each 64-bit, 32-bit or 16-bit granule. */
15437
15438 static bool
15439 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15440 {
15441 HOST_WIDE_INT diff;
15442 unsigned int i, size, unspec;
15443 machine_mode pred_mode;
15444
15445 if (d->vec_flags == VEC_SVE_PRED
15446 || !d->one_vector_p
15447 || !d->perm[0].is_constant (&diff))
15448 return false;
15449
15450 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15451 if (size == 8)
15452 {
15453 unspec = UNSPEC_REV64;
15454 pred_mode = VNx2BImode;
15455 }
15456 else if (size == 4)
15457 {
15458 unspec = UNSPEC_REV32;
15459 pred_mode = VNx4BImode;
15460 }
15461 else if (size == 2)
15462 {
15463 unspec = UNSPEC_REV16;
15464 pred_mode = VNx8BImode;
15465 }
15466 else
15467 return false;
15468
15469 unsigned int step = diff + 1;
15470 for (i = 0; i < step; ++i)
15471 if (!d->perm.series_p (i, step, diff - i, step))
15472 return false;
15473
15474 /* Success! */
15475 if (d->testing_p)
15476 return true;
15477
15478 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15479 if (d->vec_flags == VEC_SVE_DATA)
15480 {
15481 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15482 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15483 UNSPEC_MERGE_PTRUE);
15484 }
15485 emit_set_insn (d->target, src);
15486 return true;
15487 }
15488
15489 /* Recognize patterns for the REV insn, which reverses elements within
15490 a full vector. */
15491
15492 static bool
15493 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15494 {
15495 poly_uint64 nelt = d->perm.length ();
15496
15497 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15498 return false;
15499
15500 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15501 return false;
15502
15503 /* Success! */
15504 if (d->testing_p)
15505 return true;
15506
15507 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15508 emit_set_insn (d->target, src);
15509 return true;
15510 }
15511
15512 static bool
15513 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15514 {
15515 rtx out = d->target;
15516 rtx in0;
15517 HOST_WIDE_INT elt;
15518 machine_mode vmode = d->vmode;
15519 rtx lane;
15520
15521 if (d->vec_flags == VEC_SVE_PRED
15522 || d->perm.encoding ().encoded_nelts () != 1
15523 || !d->perm[0].is_constant (&elt))
15524 return false;
15525
15526 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15527 return false;
15528
15529 /* Success! */
15530 if (d->testing_p)
15531 return true;
15532
15533 /* The generic preparation in aarch64_expand_vec_perm_const_1
15534 swaps the operand order and the permute indices if it finds
15535 d->perm[0] to be in the second operand. Thus, we can always
15536 use d->op0 and need not do any extra arithmetic to get the
15537 correct lane number. */
15538 in0 = d->op0;
15539 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15540
15541 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15542 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15543 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15544 return true;
15545 }
15546
15547 static bool
15548 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15549 {
15550 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15551 machine_mode vmode = d->vmode;
15552
15553 /* Make sure that the indices are constant. */
15554 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15555 for (unsigned int i = 0; i < encoded_nelts; ++i)
15556 if (!d->perm[i].is_constant ())
15557 return false;
15558
15559 if (d->testing_p)
15560 return true;
15561
15562 /* Generic code will try constant permutation twice. Once with the
15563 original mode and again with the elements lowered to QImode.
15564 So wait and don't do the selector expansion ourselves. */
15565 if (vmode != V8QImode && vmode != V16QImode)
15566 return false;
15567
15568 /* to_constant is safe since this routine is specific to Advanced SIMD
15569 vectors. */
15570 unsigned int nelt = d->perm.length ().to_constant ();
15571 for (unsigned int i = 0; i < nelt; ++i)
15572 /* If big-endian and two vectors we end up with a weird mixed-endian
15573 mode on NEON. Reverse the index within each word but not the word
15574 itself. to_constant is safe because we checked is_constant above. */
15575 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15576 ? d->perm[i].to_constant () ^ (nelt - 1)
15577 : d->perm[i].to_constant ());
15578
15579 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15580 sel = force_reg (vmode, sel);
15581
15582 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15583 return true;
15584 }
15585
15586 /* Try to implement D using an SVE TBL instruction. */
15587
15588 static bool
15589 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15590 {
15591 unsigned HOST_WIDE_INT nelt;
15592
15593 /* Permuting two variable-length vectors could overflow the
15594 index range. */
15595 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15596 return false;
15597
15598 if (d->testing_p)
15599 return true;
15600
15601 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15602 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15603 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15604 return true;
15605 }
15606
15607 static bool
15608 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15609 {
15610 /* The pattern matching functions above are written to look for a small
15611 number to begin the sequence (0, 1, N/2). If we begin with an index
15612 from the second operand, we can swap the operands. */
15613 poly_int64 nelt = d->perm.length ();
15614 if (known_ge (d->perm[0], nelt))
15615 {
15616 d->perm.rotate_inputs (1);
15617 std::swap (d->op0, d->op1);
15618 }
15619
15620 if ((d->vec_flags == VEC_ADVSIMD
15621 || d->vec_flags == VEC_SVE_DATA
15622 || d->vec_flags == VEC_SVE_PRED)
15623 && known_gt (nelt, 1))
15624 {
15625 if (aarch64_evpc_rev_local (d))
15626 return true;
15627 else if (aarch64_evpc_rev_global (d))
15628 return true;
15629 else if (aarch64_evpc_ext (d))
15630 return true;
15631 else if (aarch64_evpc_dup (d))
15632 return true;
15633 else if (aarch64_evpc_zip (d))
15634 return true;
15635 else if (aarch64_evpc_uzp (d))
15636 return true;
15637 else if (aarch64_evpc_trn (d))
15638 return true;
15639 if (d->vec_flags == VEC_SVE_DATA)
15640 return aarch64_evpc_sve_tbl (d);
15641 else if (d->vec_flags == VEC_SVE_DATA)
15642 return aarch64_evpc_tbl (d);
15643 }
15644 return false;
15645 }
15646
15647 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15648
15649 static bool
15650 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15651 rtx op1, const vec_perm_indices &sel)
15652 {
15653 struct expand_vec_perm_d d;
15654
15655 /* Check whether the mask can be applied to a single vector. */
15656 if (op0 && rtx_equal_p (op0, op1))
15657 d.one_vector_p = true;
15658 else if (sel.all_from_input_p (0))
15659 {
15660 d.one_vector_p = true;
15661 op1 = op0;
15662 }
15663 else if (sel.all_from_input_p (1))
15664 {
15665 d.one_vector_p = true;
15666 op0 = op1;
15667 }
15668 else
15669 d.one_vector_p = false;
15670
15671 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15672 sel.nelts_per_input ());
15673 d.vmode = vmode;
15674 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15675 d.target = target;
15676 d.op0 = op0;
15677 d.op1 = op1;
15678 d.testing_p = !target;
15679
15680 if (!d.testing_p)
15681 return aarch64_expand_vec_perm_const_1 (&d);
15682
15683 rtx_insn *last = get_last_insn ();
15684 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15685 gcc_assert (last == get_last_insn ());
15686
15687 return ret;
15688 }
15689
15690 /* Generate a byte permute mask for a register of mode MODE,
15691 which has NUNITS units. */
15692
15693 rtx
15694 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15695 {
15696 /* We have to reverse each vector because we dont have
15697 a permuted load that can reverse-load according to ABI rules. */
15698 rtx mask;
15699 rtvec v = rtvec_alloc (16);
15700 unsigned int i, j;
15701 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15702
15703 gcc_assert (BYTES_BIG_ENDIAN);
15704 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15705
15706 for (i = 0; i < nunits; i++)
15707 for (j = 0; j < usize; j++)
15708 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15709 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15710 return force_reg (V16QImode, mask);
15711 }
15712
15713 /* Return true if X is a valid second operand for the SVE instruction
15714 that implements integer comparison OP_CODE. */
15715
15716 static bool
15717 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15718 {
15719 if (register_operand (x, VOIDmode))
15720 return true;
15721
15722 switch (op_code)
15723 {
15724 case LTU:
15725 case LEU:
15726 case GEU:
15727 case GTU:
15728 return aarch64_sve_cmp_immediate_p (x, false);
15729 case LT:
15730 case LE:
15731 case GE:
15732 case GT:
15733 case NE:
15734 case EQ:
15735 return aarch64_sve_cmp_immediate_p (x, true);
15736 default:
15737 gcc_unreachable ();
15738 }
15739 }
15740
15741 /* Use predicated SVE instructions to implement the equivalent of:
15742
15743 (set TARGET OP)
15744
15745 given that PTRUE is an all-true predicate of the appropriate mode. */
15746
15747 static void
15748 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15749 {
15750 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15751 gen_rtvec (2, ptrue, op),
15752 UNSPEC_MERGE_PTRUE);
15753 rtx_insn *insn = emit_set_insn (target, unspec);
15754 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15755 }
15756
15757 /* Likewise, but also clobber the condition codes. */
15758
15759 static void
15760 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15761 {
15762 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15763 gen_rtvec (2, ptrue, op),
15764 UNSPEC_MERGE_PTRUE);
15765 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15766 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15767 }
15768
15769 /* Return the UNSPEC_COND_* code for comparison CODE. */
15770
15771 static unsigned int
15772 aarch64_unspec_cond_code (rtx_code code)
15773 {
15774 switch (code)
15775 {
15776 case NE:
15777 return UNSPEC_COND_NE;
15778 case EQ:
15779 return UNSPEC_COND_EQ;
15780 case LT:
15781 return UNSPEC_COND_LT;
15782 case GT:
15783 return UNSPEC_COND_GT;
15784 case LE:
15785 return UNSPEC_COND_LE;
15786 case GE:
15787 return UNSPEC_COND_GE;
15788 default:
15789 gcc_unreachable ();
15790 }
15791 }
15792
15793 /* Emit:
15794
15795 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15796
15797 where <X> is the operation associated with comparison CODE. This form
15798 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15799 semantics, such as when PRED might not be all-true and when comparing
15800 inactive lanes could have side effects. */
15801
15802 static void
15803 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15804 rtx pred, rtx op0, rtx op1)
15805 {
15806 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15807 gen_rtvec (3, pred, op0, op1),
15808 aarch64_unspec_cond_code (code));
15809 emit_set_insn (target, unspec);
15810 }
15811
15812 /* Expand an SVE integer comparison using the SVE equivalent of:
15813
15814 (set TARGET (CODE OP0 OP1)). */
15815
15816 void
15817 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15818 {
15819 machine_mode pred_mode = GET_MODE (target);
15820 machine_mode data_mode = GET_MODE (op0);
15821
15822 if (!aarch64_sve_cmp_operand_p (code, op1))
15823 op1 = force_reg (data_mode, op1);
15824
15825 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15826 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15827 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15828 }
15829
15830 /* Emit the SVE equivalent of:
15831
15832 (set TMP1 (CODE1 OP0 OP1))
15833 (set TMP2 (CODE2 OP0 OP1))
15834 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15835
15836 PTRUE is an all-true predicate with the same mode as TARGET. */
15837
15838 static void
15839 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15840 rtx ptrue, rtx op0, rtx op1)
15841 {
15842 machine_mode pred_mode = GET_MODE (ptrue);
15843 rtx tmp1 = gen_reg_rtx (pred_mode);
15844 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15845 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15846 rtx tmp2 = gen_reg_rtx (pred_mode);
15847 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15848 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15849 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15850 }
15851
15852 /* Emit the SVE equivalent of:
15853
15854 (set TMP (CODE OP0 OP1))
15855 (set TARGET (not TMP))
15856
15857 PTRUE is an all-true predicate with the same mode as TARGET. */
15858
15859 static void
15860 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15861 rtx op0, rtx op1)
15862 {
15863 machine_mode pred_mode = GET_MODE (ptrue);
15864 rtx tmp = gen_reg_rtx (pred_mode);
15865 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15866 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15867 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15868 }
15869
15870 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15871
15872 (set TARGET (CODE OP0 OP1))
15873
15874 If CAN_INVERT_P is true, the caller can also handle inverted results;
15875 return true if the result is in fact inverted. */
15876
15877 bool
15878 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15879 rtx op0, rtx op1, bool can_invert_p)
15880 {
15881 machine_mode pred_mode = GET_MODE (target);
15882 machine_mode data_mode = GET_MODE (op0);
15883
15884 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15885 switch (code)
15886 {
15887 case UNORDERED:
15888 /* UNORDERED has no immediate form. */
15889 op1 = force_reg (data_mode, op1);
15890 /* fall through */
15891 case LT:
15892 case LE:
15893 case GT:
15894 case GE:
15895 case EQ:
15896 case NE:
15897 {
15898 /* There is native support for the comparison. */
15899 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15900 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15901 return false;
15902 }
15903
15904 case LTGT:
15905 /* This is a trapping operation (LT or GT). */
15906 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15907 return false;
15908
15909 case UNEQ:
15910 if (!flag_trapping_math)
15911 {
15912 /* This would trap for signaling NaNs. */
15913 op1 = force_reg (data_mode, op1);
15914 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15915 return false;
15916 }
15917 /* fall through */
15918 case UNLT:
15919 case UNLE:
15920 case UNGT:
15921 case UNGE:
15922 if (flag_trapping_math)
15923 {
15924 /* Work out which elements are ordered. */
15925 rtx ordered = gen_reg_rtx (pred_mode);
15926 op1 = force_reg (data_mode, op1);
15927 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15928
15929 /* Test the opposite condition for the ordered elements,
15930 then invert the result. */
15931 if (code == UNEQ)
15932 code = NE;
15933 else
15934 code = reverse_condition_maybe_unordered (code);
15935 if (can_invert_p)
15936 {
15937 aarch64_emit_sve_predicated_cond (target, code,
15938 ordered, op0, op1);
15939 return true;
15940 }
15941 rtx tmp = gen_reg_rtx (pred_mode);
15942 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15943 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15944 return false;
15945 }
15946 break;
15947
15948 case ORDERED:
15949 /* ORDERED has no immediate form. */
15950 op1 = force_reg (data_mode, op1);
15951 break;
15952
15953 default:
15954 gcc_unreachable ();
15955 }
15956
15957 /* There is native support for the inverse comparison. */
15958 code = reverse_condition_maybe_unordered (code);
15959 if (can_invert_p)
15960 {
15961 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15962 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15963 return true;
15964 }
15965 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
15966 return false;
15967 }
15968
15969 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15970 of the data being selected and CMP_MODE is the mode of the values being
15971 compared. */
15972
15973 void
15974 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15975 rtx *ops)
15976 {
15977 machine_mode pred_mode
15978 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15979 GET_MODE_SIZE (cmp_mode)).require ();
15980 rtx pred = gen_reg_rtx (pred_mode);
15981 if (FLOAT_MODE_P (cmp_mode))
15982 {
15983 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15984 ops[4], ops[5], true))
15985 std::swap (ops[1], ops[2]);
15986 }
15987 else
15988 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15989
15990 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15991 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15992 }
15993
15994 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15995 true. However due to issues with register allocation it is preferable
15996 to avoid tieing integer scalar and FP scalar modes. Executing integer
15997 operations in general registers is better than treating them as scalar
15998 vector operations. This reduces latency and avoids redundant int<->FP
15999 moves. So tie modes if they are either the same class, or vector modes
16000 with other vector modes, vector structs or any scalar mode. */
16001
16002 static bool
16003 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16004 {
16005 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16006 return true;
16007
16008 /* We specifically want to allow elements of "structure" modes to
16009 be tieable to the structure. This more general condition allows
16010 other rarer situations too. The reason we don't extend this to
16011 predicate modes is that there are no predicate structure modes
16012 nor any specific instructions for extracting part of a predicate
16013 register. */
16014 if (aarch64_vector_data_mode_p (mode1)
16015 && aarch64_vector_data_mode_p (mode2))
16016 return true;
16017
16018 /* Also allow any scalar modes with vectors. */
16019 if (aarch64_vector_mode_supported_p (mode1)
16020 || aarch64_vector_mode_supported_p (mode2))
16021 return true;
16022
16023 return false;
16024 }
16025
16026 /* Return a new RTX holding the result of moving POINTER forward by
16027 AMOUNT bytes. */
16028
16029 static rtx
16030 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16031 {
16032 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16033
16034 return adjust_automodify_address (pointer, GET_MODE (pointer),
16035 next, amount);
16036 }
16037
16038 /* Return a new RTX holding the result of moving POINTER forward by the
16039 size of the mode it points to. */
16040
16041 static rtx
16042 aarch64_progress_pointer (rtx pointer)
16043 {
16044 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16045 }
16046
16047 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16048 MODE bytes. */
16049
16050 static void
16051 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16052 machine_mode mode)
16053 {
16054 rtx reg = gen_reg_rtx (mode);
16055
16056 /* "Cast" the pointers to the correct mode. */
16057 *src = adjust_address (*src, mode, 0);
16058 *dst = adjust_address (*dst, mode, 0);
16059 /* Emit the memcpy. */
16060 emit_move_insn (reg, *src);
16061 emit_move_insn (*dst, reg);
16062 /* Move the pointers forward. */
16063 *src = aarch64_progress_pointer (*src);
16064 *dst = aarch64_progress_pointer (*dst);
16065 }
16066
16067 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16068 we succeed, otherwise return false. */
16069
16070 bool
16071 aarch64_expand_movmem (rtx *operands)
16072 {
16073 unsigned int n;
16074 rtx dst = operands[0];
16075 rtx src = operands[1];
16076 rtx base;
16077 bool speed_p = !optimize_function_for_size_p (cfun);
16078
16079 /* When optimizing for size, give a better estimate of the length of a
16080 memcpy call, but use the default otherwise. */
16081 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16082
16083 /* We can't do anything smart if the amount to copy is not constant. */
16084 if (!CONST_INT_P (operands[2]))
16085 return false;
16086
16087 n = UINTVAL (operands[2]);
16088
16089 /* Try to keep the number of instructions low. For cases below 16 bytes we
16090 need to make at most two moves. For cases above 16 bytes it will be one
16091 move for each 16 byte chunk, then at most two additional moves. */
16092 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16093 return false;
16094
16095 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16096 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16097
16098 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16099 src = adjust_automodify_address (src, VOIDmode, base, 0);
16100
16101 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16102 1-byte chunk. */
16103 if (n < 4)
16104 {
16105 if (n >= 2)
16106 {
16107 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16108 n -= 2;
16109 }
16110
16111 if (n == 1)
16112 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16113
16114 return true;
16115 }
16116
16117 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16118 4-byte chunk, partially overlapping with the previously copied chunk. */
16119 if (n < 8)
16120 {
16121 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16122 n -= 4;
16123 if (n > 0)
16124 {
16125 int move = n - 4;
16126
16127 src = aarch64_move_pointer (src, move);
16128 dst = aarch64_move_pointer (dst, move);
16129 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16130 }
16131 return true;
16132 }
16133
16134 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16135 them, then (if applicable) an 8-byte chunk. */
16136 while (n >= 8)
16137 {
16138 if (n / 16)
16139 {
16140 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16141 n -= 16;
16142 }
16143 else
16144 {
16145 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16146 n -= 8;
16147 }
16148 }
16149
16150 /* Finish the final bytes of the copy. We can always do this in one
16151 instruction. We either copy the exact amount we need, or partially
16152 overlap with the previous chunk we copied and copy 8-bytes. */
16153 if (n == 0)
16154 return true;
16155 else if (n == 1)
16156 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16157 else if (n == 2)
16158 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16159 else if (n == 4)
16160 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16161 else
16162 {
16163 if (n == 3)
16164 {
16165 src = aarch64_move_pointer (src, -1);
16166 dst = aarch64_move_pointer (dst, -1);
16167 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16168 }
16169 else
16170 {
16171 int move = n - 8;
16172
16173 src = aarch64_move_pointer (src, move);
16174 dst = aarch64_move_pointer (dst, move);
16175 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16176 }
16177 }
16178
16179 return true;
16180 }
16181
16182 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16183 SImode stores. Handle the case when the constant has identical
16184 bottom and top halves. This is beneficial when the two stores can be
16185 merged into an STP and we avoid synthesising potentially expensive
16186 immediates twice. Return true if such a split is possible. */
16187
16188 bool
16189 aarch64_split_dimode_const_store (rtx dst, rtx src)
16190 {
16191 rtx lo = gen_lowpart (SImode, src);
16192 rtx hi = gen_highpart_mode (SImode, DImode, src);
16193
16194 bool size_p = optimize_function_for_size_p (cfun);
16195
16196 if (!rtx_equal_p (lo, hi))
16197 return false;
16198
16199 unsigned int orig_cost
16200 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16201 unsigned int lo_cost
16202 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16203
16204 /* We want to transform:
16205 MOV x1, 49370
16206 MOVK x1, 0x140, lsl 16
16207 MOVK x1, 0xc0da, lsl 32
16208 MOVK x1, 0x140, lsl 48
16209 STR x1, [x0]
16210 into:
16211 MOV w1, 49370
16212 MOVK w1, 0x140, lsl 16
16213 STP w1, w1, [x0]
16214 So we want to perform this only when we save two instructions
16215 or more. When optimizing for size, however, accept any code size
16216 savings we can. */
16217 if (size_p && orig_cost <= lo_cost)
16218 return false;
16219
16220 if (!size_p
16221 && (orig_cost <= lo_cost + 1))
16222 return false;
16223
16224 rtx mem_lo = adjust_address (dst, SImode, 0);
16225 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16226 return false;
16227
16228 rtx tmp_reg = gen_reg_rtx (SImode);
16229 aarch64_expand_mov_immediate (tmp_reg, lo);
16230 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16231 /* Don't emit an explicit store pair as this may not be always profitable.
16232 Let the sched-fusion logic decide whether to merge them. */
16233 emit_move_insn (mem_lo, tmp_reg);
16234 emit_move_insn (mem_hi, tmp_reg);
16235
16236 return true;
16237 }
16238
16239 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16240
16241 static unsigned HOST_WIDE_INT
16242 aarch64_asan_shadow_offset (void)
16243 {
16244 return (HOST_WIDE_INT_1 << 36);
16245 }
16246
16247 static rtx
16248 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16249 int code, tree treeop0, tree treeop1)
16250 {
16251 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16252 rtx op0, op1;
16253 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16254 insn_code icode;
16255 struct expand_operand ops[4];
16256
16257 start_sequence ();
16258 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16259
16260 op_mode = GET_MODE (op0);
16261 if (op_mode == VOIDmode)
16262 op_mode = GET_MODE (op1);
16263
16264 switch (op_mode)
16265 {
16266 case E_QImode:
16267 case E_HImode:
16268 case E_SImode:
16269 cmp_mode = SImode;
16270 icode = CODE_FOR_cmpsi;
16271 break;
16272
16273 case E_DImode:
16274 cmp_mode = DImode;
16275 icode = CODE_FOR_cmpdi;
16276 break;
16277
16278 case E_SFmode:
16279 cmp_mode = SFmode;
16280 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16281 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16282 break;
16283
16284 case E_DFmode:
16285 cmp_mode = DFmode;
16286 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16287 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16288 break;
16289
16290 default:
16291 end_sequence ();
16292 return NULL_RTX;
16293 }
16294
16295 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16296 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16297 if (!op0 || !op1)
16298 {
16299 end_sequence ();
16300 return NULL_RTX;
16301 }
16302 *prep_seq = get_insns ();
16303 end_sequence ();
16304
16305 create_fixed_operand (&ops[0], op0);
16306 create_fixed_operand (&ops[1], op1);
16307
16308 start_sequence ();
16309 if (!maybe_expand_insn (icode, 2, ops))
16310 {
16311 end_sequence ();
16312 return NULL_RTX;
16313 }
16314 *gen_seq = get_insns ();
16315 end_sequence ();
16316
16317 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16318 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16319 }
16320
16321 static rtx
16322 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16323 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16324 {
16325 rtx op0, op1, target;
16326 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16327 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16328 insn_code icode;
16329 struct expand_operand ops[6];
16330 int aarch64_cond;
16331
16332 push_to_sequence (*prep_seq);
16333 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16334
16335 op_mode = GET_MODE (op0);
16336 if (op_mode == VOIDmode)
16337 op_mode = GET_MODE (op1);
16338
16339 switch (op_mode)
16340 {
16341 case E_QImode:
16342 case E_HImode:
16343 case E_SImode:
16344 cmp_mode = SImode;
16345 icode = CODE_FOR_ccmpsi;
16346 break;
16347
16348 case E_DImode:
16349 cmp_mode = DImode;
16350 icode = CODE_FOR_ccmpdi;
16351 break;
16352
16353 case E_SFmode:
16354 cmp_mode = SFmode;
16355 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16356 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16357 break;
16358
16359 case E_DFmode:
16360 cmp_mode = DFmode;
16361 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16362 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16363 break;
16364
16365 default:
16366 end_sequence ();
16367 return NULL_RTX;
16368 }
16369
16370 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16371 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16372 if (!op0 || !op1)
16373 {
16374 end_sequence ();
16375 return NULL_RTX;
16376 }
16377 *prep_seq = get_insns ();
16378 end_sequence ();
16379
16380 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16381 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16382
16383 if (bit_code != AND)
16384 {
16385 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16386 GET_MODE (XEXP (prev, 0))),
16387 VOIDmode, XEXP (prev, 0), const0_rtx);
16388 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16389 }
16390
16391 create_fixed_operand (&ops[0], XEXP (prev, 0));
16392 create_fixed_operand (&ops[1], target);
16393 create_fixed_operand (&ops[2], op0);
16394 create_fixed_operand (&ops[3], op1);
16395 create_fixed_operand (&ops[4], prev);
16396 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16397
16398 push_to_sequence (*gen_seq);
16399 if (!maybe_expand_insn (icode, 6, ops))
16400 {
16401 end_sequence ();
16402 return NULL_RTX;
16403 }
16404
16405 *gen_seq = get_insns ();
16406 end_sequence ();
16407
16408 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16409 }
16410
16411 #undef TARGET_GEN_CCMP_FIRST
16412 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16413
16414 #undef TARGET_GEN_CCMP_NEXT
16415 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16416
16417 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16418 instruction fusion of some sort. */
16419
16420 static bool
16421 aarch64_macro_fusion_p (void)
16422 {
16423 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16424 }
16425
16426
16427 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16428 should be kept together during scheduling. */
16429
16430 static bool
16431 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16432 {
16433 rtx set_dest;
16434 rtx prev_set = single_set (prev);
16435 rtx curr_set = single_set (curr);
16436 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16437 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16438
16439 if (!aarch64_macro_fusion_p ())
16440 return false;
16441
16442 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16443 {
16444 /* We are trying to match:
16445 prev (mov) == (set (reg r0) (const_int imm16))
16446 curr (movk) == (set (zero_extract (reg r0)
16447 (const_int 16)
16448 (const_int 16))
16449 (const_int imm16_1)) */
16450
16451 set_dest = SET_DEST (curr_set);
16452
16453 if (GET_CODE (set_dest) == ZERO_EXTRACT
16454 && CONST_INT_P (SET_SRC (curr_set))
16455 && CONST_INT_P (SET_SRC (prev_set))
16456 && CONST_INT_P (XEXP (set_dest, 2))
16457 && INTVAL (XEXP (set_dest, 2)) == 16
16458 && REG_P (XEXP (set_dest, 0))
16459 && REG_P (SET_DEST (prev_set))
16460 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16461 {
16462 return true;
16463 }
16464 }
16465
16466 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16467 {
16468
16469 /* We're trying to match:
16470 prev (adrp) == (set (reg r1)
16471 (high (symbol_ref ("SYM"))))
16472 curr (add) == (set (reg r0)
16473 (lo_sum (reg r1)
16474 (symbol_ref ("SYM"))))
16475 Note that r0 need not necessarily be the same as r1, especially
16476 during pre-regalloc scheduling. */
16477
16478 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16479 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16480 {
16481 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16482 && REG_P (XEXP (SET_SRC (curr_set), 0))
16483 && REGNO (XEXP (SET_SRC (curr_set), 0))
16484 == REGNO (SET_DEST (prev_set))
16485 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16486 XEXP (SET_SRC (curr_set), 1)))
16487 return true;
16488 }
16489 }
16490
16491 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16492 {
16493
16494 /* We're trying to match:
16495 prev (movk) == (set (zero_extract (reg r0)
16496 (const_int 16)
16497 (const_int 32))
16498 (const_int imm16_1))
16499 curr (movk) == (set (zero_extract (reg r0)
16500 (const_int 16)
16501 (const_int 48))
16502 (const_int imm16_2)) */
16503
16504 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16505 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16506 && REG_P (XEXP (SET_DEST (prev_set), 0))
16507 && REG_P (XEXP (SET_DEST (curr_set), 0))
16508 && REGNO (XEXP (SET_DEST (prev_set), 0))
16509 == REGNO (XEXP (SET_DEST (curr_set), 0))
16510 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16511 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16512 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16513 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16514 && CONST_INT_P (SET_SRC (prev_set))
16515 && CONST_INT_P (SET_SRC (curr_set)))
16516 return true;
16517
16518 }
16519 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16520 {
16521 /* We're trying to match:
16522 prev (adrp) == (set (reg r0)
16523 (high (symbol_ref ("SYM"))))
16524 curr (ldr) == (set (reg r1)
16525 (mem (lo_sum (reg r0)
16526 (symbol_ref ("SYM")))))
16527 or
16528 curr (ldr) == (set (reg r1)
16529 (zero_extend (mem
16530 (lo_sum (reg r0)
16531 (symbol_ref ("SYM")))))) */
16532 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16533 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16534 {
16535 rtx curr_src = SET_SRC (curr_set);
16536
16537 if (GET_CODE (curr_src) == ZERO_EXTEND)
16538 curr_src = XEXP (curr_src, 0);
16539
16540 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16541 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16542 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16543 == REGNO (SET_DEST (prev_set))
16544 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16545 XEXP (SET_SRC (prev_set), 0)))
16546 return true;
16547 }
16548 }
16549
16550 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16551 && aarch_crypto_can_dual_issue (prev, curr))
16552 return true;
16553
16554 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16555 && any_condjump_p (curr))
16556 {
16557 enum attr_type prev_type = get_attr_type (prev);
16558
16559 unsigned int condreg1, condreg2;
16560 rtx cc_reg_1;
16561 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16562 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16563
16564 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16565 && prev
16566 && modified_in_p (cc_reg_1, prev))
16567 {
16568 /* FIXME: this misses some which is considered simple arthematic
16569 instructions for ThunderX. Simple shifts are missed here. */
16570 if (prev_type == TYPE_ALUS_SREG
16571 || prev_type == TYPE_ALUS_IMM
16572 || prev_type == TYPE_LOGICS_REG
16573 || prev_type == TYPE_LOGICS_IMM)
16574 return true;
16575 }
16576 }
16577
16578 if (prev_set
16579 && curr_set
16580 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16581 && any_condjump_p (curr))
16582 {
16583 /* We're trying to match:
16584 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16585 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16586 (const_int 0))
16587 (label_ref ("SYM"))
16588 (pc)) */
16589 if (SET_DEST (curr_set) == (pc_rtx)
16590 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16591 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16592 && REG_P (SET_DEST (prev_set))
16593 && REGNO (SET_DEST (prev_set))
16594 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16595 {
16596 /* Fuse ALU operations followed by conditional branch instruction. */
16597 switch (get_attr_type (prev))
16598 {
16599 case TYPE_ALU_IMM:
16600 case TYPE_ALU_SREG:
16601 case TYPE_ADC_REG:
16602 case TYPE_ADC_IMM:
16603 case TYPE_ADCS_REG:
16604 case TYPE_ADCS_IMM:
16605 case TYPE_LOGIC_REG:
16606 case TYPE_LOGIC_IMM:
16607 case TYPE_CSEL:
16608 case TYPE_ADR:
16609 case TYPE_MOV_IMM:
16610 case TYPE_SHIFT_REG:
16611 case TYPE_SHIFT_IMM:
16612 case TYPE_BFM:
16613 case TYPE_RBIT:
16614 case TYPE_REV:
16615 case TYPE_EXTEND:
16616 return true;
16617
16618 default:;
16619 }
16620 }
16621 }
16622
16623 return false;
16624 }
16625
16626 /* Return true iff the instruction fusion described by OP is enabled. */
16627
16628 bool
16629 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16630 {
16631 return (aarch64_tune_params.fusible_ops & op) != 0;
16632 }
16633
16634 /* If MEM is in the form of [base+offset], extract the two parts
16635 of address and set to BASE and OFFSET, otherwise return false
16636 after clearing BASE and OFFSET. */
16637
16638 bool
16639 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16640 {
16641 rtx addr;
16642
16643 gcc_assert (MEM_P (mem));
16644
16645 addr = XEXP (mem, 0);
16646
16647 if (REG_P (addr))
16648 {
16649 *base = addr;
16650 *offset = const0_rtx;
16651 return true;
16652 }
16653
16654 if (GET_CODE (addr) == PLUS
16655 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16656 {
16657 *base = XEXP (addr, 0);
16658 *offset = XEXP (addr, 1);
16659 return true;
16660 }
16661
16662 *base = NULL_RTX;
16663 *offset = NULL_RTX;
16664
16665 return false;
16666 }
16667
16668 /* Types for scheduling fusion. */
16669 enum sched_fusion_type
16670 {
16671 SCHED_FUSION_NONE = 0,
16672 SCHED_FUSION_LD_SIGN_EXTEND,
16673 SCHED_FUSION_LD_ZERO_EXTEND,
16674 SCHED_FUSION_LD,
16675 SCHED_FUSION_ST,
16676 SCHED_FUSION_NUM
16677 };
16678
16679 /* If INSN is a load or store of address in the form of [base+offset],
16680 extract the two parts and set to BASE and OFFSET. Return scheduling
16681 fusion type this INSN is. */
16682
16683 static enum sched_fusion_type
16684 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16685 {
16686 rtx x, dest, src;
16687 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16688
16689 gcc_assert (INSN_P (insn));
16690 x = PATTERN (insn);
16691 if (GET_CODE (x) != SET)
16692 return SCHED_FUSION_NONE;
16693
16694 src = SET_SRC (x);
16695 dest = SET_DEST (x);
16696
16697 machine_mode dest_mode = GET_MODE (dest);
16698
16699 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16700 return SCHED_FUSION_NONE;
16701
16702 if (GET_CODE (src) == SIGN_EXTEND)
16703 {
16704 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16705 src = XEXP (src, 0);
16706 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16707 return SCHED_FUSION_NONE;
16708 }
16709 else if (GET_CODE (src) == ZERO_EXTEND)
16710 {
16711 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16712 src = XEXP (src, 0);
16713 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16714 return SCHED_FUSION_NONE;
16715 }
16716
16717 if (GET_CODE (src) == MEM && REG_P (dest))
16718 extract_base_offset_in_addr (src, base, offset);
16719 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16720 {
16721 fusion = SCHED_FUSION_ST;
16722 extract_base_offset_in_addr (dest, base, offset);
16723 }
16724 else
16725 return SCHED_FUSION_NONE;
16726
16727 if (*base == NULL_RTX || *offset == NULL_RTX)
16728 fusion = SCHED_FUSION_NONE;
16729
16730 return fusion;
16731 }
16732
16733 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16734
16735 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
16736 and PRI are only calculated for these instructions. For other instruction,
16737 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16738 type instruction fusion can be added by returning different priorities.
16739
16740 It's important that irrelevant instructions get the largest FUSION_PRI. */
16741
16742 static void
16743 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16744 int *fusion_pri, int *pri)
16745 {
16746 int tmp, off_val;
16747 rtx base, offset;
16748 enum sched_fusion_type fusion;
16749
16750 gcc_assert (INSN_P (insn));
16751
16752 tmp = max_pri - 1;
16753 fusion = fusion_load_store (insn, &base, &offset);
16754 if (fusion == SCHED_FUSION_NONE)
16755 {
16756 *pri = tmp;
16757 *fusion_pri = tmp;
16758 return;
16759 }
16760
16761 /* Set FUSION_PRI according to fusion type and base register. */
16762 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16763
16764 /* Calculate PRI. */
16765 tmp /= 2;
16766
16767 /* INSN with smaller offset goes first. */
16768 off_val = (int)(INTVAL (offset));
16769 if (off_val >= 0)
16770 tmp -= (off_val & 0xfffff);
16771 else
16772 tmp += ((- off_val) & 0xfffff);
16773
16774 *pri = tmp;
16775 return;
16776 }
16777
16778 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16779 Adjust priority of sha1h instructions so they are scheduled before
16780 other SHA1 instructions. */
16781
16782 static int
16783 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16784 {
16785 rtx x = PATTERN (insn);
16786
16787 if (GET_CODE (x) == SET)
16788 {
16789 x = SET_SRC (x);
16790
16791 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16792 return priority + 10;
16793 }
16794
16795 return priority;
16796 }
16797
16798 /* Given OPERANDS of consecutive load/store, check if we can merge
16799 them into ldp/stp. LOAD is true if they are load instructions.
16800 MODE is the mode of memory operands. */
16801
16802 bool
16803 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16804 machine_mode mode)
16805 {
16806 HOST_WIDE_INT offval_1, offval_2, msize;
16807 enum reg_class rclass_1, rclass_2;
16808 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16809
16810 if (load)
16811 {
16812 mem_1 = operands[1];
16813 mem_2 = operands[3];
16814 reg_1 = operands[0];
16815 reg_2 = operands[2];
16816 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16817 if (REGNO (reg_1) == REGNO (reg_2))
16818 return false;
16819 }
16820 else
16821 {
16822 mem_1 = operands[0];
16823 mem_2 = operands[2];
16824 reg_1 = operands[1];
16825 reg_2 = operands[3];
16826 }
16827
16828 /* The mems cannot be volatile. */
16829 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16830 return false;
16831
16832 /* If we have SImode and slow unaligned ldp,
16833 check the alignment to be at least 8 byte. */
16834 if (mode == SImode
16835 && (aarch64_tune_params.extra_tuning_flags
16836 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16837 && !optimize_size
16838 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16839 return false;
16840
16841 /* Check if the addresses are in the form of [base+offset]. */
16842 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16843 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16844 return false;
16845 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16846 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16847 return false;
16848
16849 /* Check if the bases are same. */
16850 if (!rtx_equal_p (base_1, base_2))
16851 return false;
16852
16853 /* The operands must be of the same size. */
16854 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16855 GET_MODE_SIZE (GET_MODE (mem_2))));
16856
16857 offval_1 = INTVAL (offset_1);
16858 offval_2 = INTVAL (offset_2);
16859 /* We should only be trying this for fixed-sized modes. There is no
16860 SVE LDP/STP instruction. */
16861 msize = GET_MODE_SIZE (mode).to_constant ();
16862 /* Check if the offsets are consecutive. */
16863 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16864 return false;
16865
16866 /* Check if the addresses are clobbered by load. */
16867 if (load)
16868 {
16869 if (reg_mentioned_p (reg_1, mem_1))
16870 return false;
16871
16872 /* In increasing order, the last load can clobber the address. */
16873 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16874 return false;
16875 }
16876
16877 /* One of the memory accesses must be a mempair operand.
16878 If it is not the first one, they need to be swapped by the
16879 peephole. */
16880 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16881 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16882 return false;
16883
16884 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16885 rclass_1 = FP_REGS;
16886 else
16887 rclass_1 = GENERAL_REGS;
16888
16889 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16890 rclass_2 = FP_REGS;
16891 else
16892 rclass_2 = GENERAL_REGS;
16893
16894 /* Check if the registers are of same class. */
16895 if (rclass_1 != rclass_2)
16896 return false;
16897
16898 return true;
16899 }
16900
16901 /* Given OPERANDS of consecutive load/store that can be merged,
16902 swap them if they are not in ascending order. */
16903 void
16904 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16905 {
16906 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16907 HOST_WIDE_INT offval_1, offval_2;
16908
16909 if (load)
16910 {
16911 mem_1 = operands[1];
16912 mem_2 = operands[3];
16913 }
16914 else
16915 {
16916 mem_1 = operands[0];
16917 mem_2 = operands[2];
16918 }
16919
16920 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16921 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16922
16923 offval_1 = INTVAL (offset_1);
16924 offval_2 = INTVAL (offset_2);
16925
16926 if (offval_1 > offval_2)
16927 {
16928 /* Irrespective of whether this is a load or a store,
16929 we do the same swap. */
16930 std::swap (operands[0], operands[2]);
16931 std::swap (operands[1], operands[3]);
16932 }
16933 }
16934
16935 /* Given OPERANDS of consecutive load/store, check if we can merge
16936 them into ldp/stp by adjusting the offset. LOAD is true if they
16937 are load instructions. MODE is the mode of memory operands.
16938
16939 Given below consecutive stores:
16940
16941 str w1, [xb, 0x100]
16942 str w1, [xb, 0x104]
16943 str w1, [xb, 0x108]
16944 str w1, [xb, 0x10c]
16945
16946 Though the offsets are out of the range supported by stp, we can
16947 still pair them after adjusting the offset, like:
16948
16949 add scratch, xb, 0x100
16950 stp w1, w1, [scratch]
16951 stp w1, w1, [scratch, 0x8]
16952
16953 The peephole patterns detecting this opportunity should guarantee
16954 the scratch register is avaliable. */
16955
16956 bool
16957 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16958 scalar_mode mode)
16959 {
16960 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16961 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16962 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16963 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16964
16965 if (load)
16966 {
16967 reg_1 = operands[0];
16968 mem_1 = operands[1];
16969 reg_2 = operands[2];
16970 mem_2 = operands[3];
16971 reg_3 = operands[4];
16972 mem_3 = operands[5];
16973 reg_4 = operands[6];
16974 mem_4 = operands[7];
16975 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16976 && REG_P (reg_3) && REG_P (reg_4));
16977 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16978 return false;
16979 }
16980 else
16981 {
16982 mem_1 = operands[0];
16983 reg_1 = operands[1];
16984 mem_2 = operands[2];
16985 reg_2 = operands[3];
16986 mem_3 = operands[4];
16987 reg_3 = operands[5];
16988 mem_4 = operands[6];
16989 reg_4 = operands[7];
16990 }
16991 /* Skip if memory operand is by itslef valid for ldp/stp. */
16992 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16993 return false;
16994
16995 /* The mems cannot be volatile. */
16996 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16997 || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
16998 return false;
16999
17000 /* Check if the addresses are in the form of [base+offset]. */
17001 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17002 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17003 return false;
17004 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17005 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17006 return false;
17007 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
17008 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
17009 return false;
17010 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
17011 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
17012 return false;
17013
17014 /* Check if the bases are same. */
17015 if (!rtx_equal_p (base_1, base_2)
17016 || !rtx_equal_p (base_2, base_3)
17017 || !rtx_equal_p (base_3, base_4))
17018 return false;
17019
17020 offval_1 = INTVAL (offset_1);
17021 offval_2 = INTVAL (offset_2);
17022 offval_3 = INTVAL (offset_3);
17023 offval_4 = INTVAL (offset_4);
17024 msize = GET_MODE_SIZE (mode);
17025 /* Check if the offsets are consecutive. */
17026 if ((offval_1 != (offval_2 + msize)
17027 || offval_1 != (offval_3 + msize * 2)
17028 || offval_1 != (offval_4 + msize * 3))
17029 && (offval_4 != (offval_3 + msize)
17030 || offval_4 != (offval_2 + msize * 2)
17031 || offval_4 != (offval_1 + msize * 3)))
17032 return false;
17033
17034 /* Check if the addresses are clobbered by load. */
17035 if (load)
17036 {
17037 if (reg_mentioned_p (reg_1, mem_1)
17038 || reg_mentioned_p (reg_2, mem_2)
17039 || reg_mentioned_p (reg_3, mem_3))
17040 return false;
17041
17042 /* In increasing order, the last load can clobber the address. */
17043 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
17044 return false;
17045 }
17046
17047 /* If we have SImode and slow unaligned ldp,
17048 check the alignment to be at least 8 byte. */
17049 if (mode == SImode
17050 && (aarch64_tune_params.extra_tuning_flags
17051 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17052 && !optimize_size
17053 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17054 return false;
17055
17056 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17057 rclass_1 = FP_REGS;
17058 else
17059 rclass_1 = GENERAL_REGS;
17060
17061 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17062 rclass_2 = FP_REGS;
17063 else
17064 rclass_2 = GENERAL_REGS;
17065
17066 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17067 rclass_3 = FP_REGS;
17068 else
17069 rclass_3 = GENERAL_REGS;
17070
17071 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17072 rclass_4 = FP_REGS;
17073 else
17074 rclass_4 = GENERAL_REGS;
17075
17076 /* Check if the registers are of same class. */
17077 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17078 return false;
17079
17080 return true;
17081 }
17082
17083 /* Given OPERANDS of consecutive load/store, this function pairs them
17084 into ldp/stp after adjusting the offset. It depends on the fact
17085 that addresses of load/store instructions are in increasing order.
17086 MODE is the mode of memory operands. CODE is the rtl operator
17087 which should be applied to all memory operands, it's SIGN_EXTEND,
17088 ZERO_EXTEND or UNKNOWN. */
17089
17090 bool
17091 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17092 scalar_mode mode, RTX_CODE code)
17093 {
17094 rtx base, offset_1, offset_2, t1, t2;
17095 rtx mem_1, mem_2, mem_3, mem_4;
17096 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
17097
17098 if (load)
17099 {
17100 mem_1 = operands[1];
17101 mem_2 = operands[3];
17102 }
17103 else
17104 {
17105 mem_1 = operands[0];
17106 mem_2 = operands[2];
17107 }
17108
17109 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17110 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17111 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17112 && offset_2 != NULL_RTX);
17113
17114 if (INTVAL (offset_1) > INTVAL (offset_2))
17115 {
17116 std::swap (operands[0], operands[6]);
17117 std::swap (operands[1], operands[7]);
17118 std::swap (operands[2], operands[4]);
17119 std::swap (operands[3], operands[5]);
17120 }
17121
17122 if (load)
17123 {
17124 mem_1 = operands[1];
17125 mem_2 = operands[3];
17126 mem_3 = operands[5];
17127 mem_4 = operands[7];
17128 }
17129 else
17130 {
17131 mem_1 = operands[0];
17132 mem_2 = operands[2];
17133 mem_3 = operands[4];
17134 mem_4 = operands[6];
17135 gcc_assert (code == UNKNOWN);
17136 }
17137
17138 /* Extract the offset of the new first address. */
17139 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17140 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17141
17142 /* Adjust offset thus it can fit in ldp/stp instruction. */
17143 msize = GET_MODE_SIZE (mode);
17144 stp_off_limit = msize * 0x40;
17145 off_val = INTVAL (offset_1);
17146 abs_off = (off_val < 0) ? -off_val : off_val;
17147 new_off = abs_off % stp_off_limit;
17148 adj_off = abs_off - new_off;
17149
17150 /* Further adjust to make sure all offsets are OK. */
17151 if ((new_off + msize * 2) >= stp_off_limit)
17152 {
17153 adj_off += stp_off_limit;
17154 new_off -= stp_off_limit;
17155 }
17156
17157 /* Make sure the adjustment can be done with ADD/SUB instructions. */
17158 if (adj_off >= 0x1000)
17159 return false;
17160
17161 if (off_val < 0)
17162 {
17163 adj_off = -adj_off;
17164 new_off = -new_off;
17165 }
17166
17167 /* Create new memory references. */
17168 mem_1 = change_address (mem_1, VOIDmode,
17169 plus_constant (DImode, operands[8], new_off));
17170
17171 /* Check if the adjusted address is OK for ldp/stp. */
17172 if (!aarch64_mem_pair_operand (mem_1, mode))
17173 return false;
17174
17175 msize = GET_MODE_SIZE (mode);
17176 mem_2 = change_address (mem_2, VOIDmode,
17177 plus_constant (DImode,
17178 operands[8],
17179 new_off + msize));
17180 mem_3 = change_address (mem_3, VOIDmode,
17181 plus_constant (DImode,
17182 operands[8],
17183 new_off + msize * 2));
17184 mem_4 = change_address (mem_4, VOIDmode,
17185 plus_constant (DImode,
17186 operands[8],
17187 new_off + msize * 3));
17188
17189 if (code == ZERO_EXTEND)
17190 {
17191 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17192 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17193 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17194 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17195 }
17196 else if (code == SIGN_EXTEND)
17197 {
17198 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17199 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17200 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17201 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17202 }
17203
17204 if (load)
17205 {
17206 operands[1] = mem_1;
17207 operands[3] = mem_2;
17208 operands[5] = mem_3;
17209 operands[7] = mem_4;
17210 }
17211 else
17212 {
17213 operands[0] = mem_1;
17214 operands[2] = mem_2;
17215 operands[4] = mem_3;
17216 operands[6] = mem_4;
17217 }
17218
17219 /* Emit adjusting instruction. */
17220 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
17221 /* Emit ldp/stp instructions. */
17222 t1 = gen_rtx_SET (operands[0], operands[1]);
17223 t2 = gen_rtx_SET (operands[2], operands[3]);
17224 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17225 t1 = gen_rtx_SET (operands[4], operands[5]);
17226 t2 = gen_rtx_SET (operands[6], operands[7]);
17227 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17228 return true;
17229 }
17230
17231 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17232 it isn't worth branching around empty masked ops (including masked
17233 stores). */
17234
17235 static bool
17236 aarch64_empty_mask_is_expensive (unsigned)
17237 {
17238 return false;
17239 }
17240
17241 /* Return 1 if pseudo register should be created and used to hold
17242 GOT address for PIC code. */
17243
17244 bool
17245 aarch64_use_pseudo_pic_reg (void)
17246 {
17247 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17248 }
17249
17250 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17251
17252 static int
17253 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17254 {
17255 switch (XINT (x, 1))
17256 {
17257 case UNSPEC_GOTSMALLPIC:
17258 case UNSPEC_GOTSMALLPIC28K:
17259 case UNSPEC_GOTTINYPIC:
17260 return 0;
17261 default:
17262 break;
17263 }
17264
17265 return default_unspec_may_trap_p (x, flags);
17266 }
17267
17268
17269 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17270 return the log2 of that value. Otherwise return -1. */
17271
17272 int
17273 aarch64_fpconst_pow_of_2 (rtx x)
17274 {
17275 const REAL_VALUE_TYPE *r;
17276
17277 if (!CONST_DOUBLE_P (x))
17278 return -1;
17279
17280 r = CONST_DOUBLE_REAL_VALUE (x);
17281
17282 if (REAL_VALUE_NEGATIVE (*r)
17283 || REAL_VALUE_ISNAN (*r)
17284 || REAL_VALUE_ISINF (*r)
17285 || !real_isinteger (r, DFmode))
17286 return -1;
17287
17288 return exact_log2 (real_to_integer (r));
17289 }
17290
17291 /* If X is a vector of equal CONST_DOUBLE values and that value is
17292 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17293
17294 int
17295 aarch64_vec_fpconst_pow_of_2 (rtx x)
17296 {
17297 int nelts;
17298 if (GET_CODE (x) != CONST_VECTOR
17299 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17300 return -1;
17301
17302 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17303 return -1;
17304
17305 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17306 if (firstval <= 0)
17307 return -1;
17308
17309 for (int i = 1; i < nelts; i++)
17310 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17311 return -1;
17312
17313 return firstval;
17314 }
17315
17316 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17317 to float.
17318
17319 __fp16 always promotes through this hook.
17320 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17321 through the generic excess precision logic rather than here. */
17322
17323 static tree
17324 aarch64_promoted_type (const_tree t)
17325 {
17326 if (SCALAR_FLOAT_TYPE_P (t)
17327 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17328 return float_type_node;
17329
17330 return NULL_TREE;
17331 }
17332
17333 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17334
17335 static bool
17336 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17337 optimization_type opt_type)
17338 {
17339 switch (op)
17340 {
17341 case rsqrt_optab:
17342 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17343
17344 default:
17345 return true;
17346 }
17347 }
17348
17349 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17350
17351 static unsigned int
17352 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17353 int *offset)
17354 {
17355 /* Polynomial invariant 1 == (VG / 2) - 1. */
17356 gcc_assert (i == 1);
17357 *factor = 2;
17358 *offset = 1;
17359 return AARCH64_DWARF_VG;
17360 }
17361
17362 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17363 if MODE is HFmode, and punt to the generic implementation otherwise. */
17364
17365 static bool
17366 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17367 {
17368 return (mode == HFmode
17369 ? true
17370 : default_libgcc_floating_mode_supported_p (mode));
17371 }
17372
17373 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17374 if MODE is HFmode, and punt to the generic implementation otherwise. */
17375
17376 static bool
17377 aarch64_scalar_mode_supported_p (scalar_mode mode)
17378 {
17379 return (mode == HFmode
17380 ? true
17381 : default_scalar_mode_supported_p (mode));
17382 }
17383
17384 /* Set the value of FLT_EVAL_METHOD.
17385 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17386
17387 0: evaluate all operations and constants, whose semantic type has at
17388 most the range and precision of type float, to the range and
17389 precision of float; evaluate all other operations and constants to
17390 the range and precision of the semantic type;
17391
17392 N, where _FloatN is a supported interchange floating type
17393 evaluate all operations and constants, whose semantic type has at
17394 most the range and precision of _FloatN type, to the range and
17395 precision of the _FloatN type; evaluate all other operations and
17396 constants to the range and precision of the semantic type;
17397
17398 If we have the ARMv8.2-A extensions then we support _Float16 in native
17399 precision, so we should set this to 16. Otherwise, we support the type,
17400 but want to evaluate expressions in float precision, so set this to
17401 0. */
17402
17403 static enum flt_eval_method
17404 aarch64_excess_precision (enum excess_precision_type type)
17405 {
17406 switch (type)
17407 {
17408 case EXCESS_PRECISION_TYPE_FAST:
17409 case EXCESS_PRECISION_TYPE_STANDARD:
17410 /* We can calculate either in 16-bit range and precision or
17411 32-bit range and precision. Make that decision based on whether
17412 we have native support for the ARMv8.2-A 16-bit floating-point
17413 instructions or not. */
17414 return (TARGET_FP_F16INST
17415 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17416 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17417 case EXCESS_PRECISION_TYPE_IMPLICIT:
17418 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17419 default:
17420 gcc_unreachable ();
17421 }
17422 return FLT_EVAL_METHOD_UNPREDICTABLE;
17423 }
17424
17425 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17426 scheduled for speculative execution. Reject the long-running division
17427 and square-root instructions. */
17428
17429 static bool
17430 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17431 {
17432 switch (get_attr_type (insn))
17433 {
17434 case TYPE_SDIV:
17435 case TYPE_UDIV:
17436 case TYPE_FDIVS:
17437 case TYPE_FDIVD:
17438 case TYPE_FSQRTS:
17439 case TYPE_FSQRTD:
17440 case TYPE_NEON_FP_SQRT_S:
17441 case TYPE_NEON_FP_SQRT_D:
17442 case TYPE_NEON_FP_SQRT_S_Q:
17443 case TYPE_NEON_FP_SQRT_D_Q:
17444 case TYPE_NEON_FP_DIV_S:
17445 case TYPE_NEON_FP_DIV_D:
17446 case TYPE_NEON_FP_DIV_S_Q:
17447 case TYPE_NEON_FP_DIV_D_Q:
17448 return false;
17449 default:
17450 return true;
17451 }
17452 }
17453
17454 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17455
17456 static int
17457 aarch64_compute_pressure_classes (reg_class *classes)
17458 {
17459 int i = 0;
17460 classes[i++] = GENERAL_REGS;
17461 classes[i++] = FP_REGS;
17462 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17463 registers need to go in PR_LO_REGS at some point during their
17464 lifetime. Splitting it into two halves has the effect of making
17465 all predicates count against PR_LO_REGS, so that we try whenever
17466 possible to restrict the number of live predicates to 8. This
17467 greatly reduces the amount of spilling in certain loops. */
17468 classes[i++] = PR_LO_REGS;
17469 classes[i++] = PR_HI_REGS;
17470 return i;
17471 }
17472
17473 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17474
17475 static bool
17476 aarch64_can_change_mode_class (machine_mode from,
17477 machine_mode to, reg_class_t)
17478 {
17479 if (BYTES_BIG_ENDIAN)
17480 {
17481 bool from_sve_p = aarch64_sve_data_mode_p (from);
17482 bool to_sve_p = aarch64_sve_data_mode_p (to);
17483
17484 /* Don't allow changes between SVE data modes and non-SVE modes.
17485 See the comment at the head of aarch64-sve.md for details. */
17486 if (from_sve_p != to_sve_p)
17487 return false;
17488
17489 /* Don't allow changes in element size: lane 0 of the new vector
17490 would not then be lane 0 of the old vector. See the comment
17491 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17492 description.
17493
17494 In the worst case, this forces a register to be spilled in
17495 one mode and reloaded in the other, which handles the
17496 endianness correctly. */
17497 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17498 return false;
17499 }
17500 return true;
17501 }
17502
17503 /* Implement TARGET_EARLY_REMAT_MODES. */
17504
17505 static void
17506 aarch64_select_early_remat_modes (sbitmap modes)
17507 {
17508 /* SVE values are not normally live across a call, so it should be
17509 worth doing early rematerialization even in VL-specific mode. */
17510 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17511 {
17512 machine_mode mode = (machine_mode) i;
17513 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17514 if (vec_flags & VEC_ANY_SVE)
17515 bitmap_set_bit (modes, i);
17516 }
17517 }
17518
17519 /* Target-specific selftests. */
17520
17521 #if CHECKING_P
17522
17523 namespace selftest {
17524
17525 /* Selftest for the RTL loader.
17526 Verify that the RTL loader copes with a dump from
17527 print_rtx_function. This is essentially just a test that class
17528 function_reader can handle a real dump, but it also verifies
17529 that lookup_reg_by_dump_name correctly handles hard regs.
17530 The presence of hard reg names in the dump means that the test is
17531 target-specific, hence it is in this file. */
17532
17533 static void
17534 aarch64_test_loading_full_dump ()
17535 {
17536 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17537
17538 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17539
17540 rtx_insn *insn_1 = get_insn_by_uid (1);
17541 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17542
17543 rtx_insn *insn_15 = get_insn_by_uid (15);
17544 ASSERT_EQ (INSN, GET_CODE (insn_15));
17545 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17546
17547 /* Verify crtl->return_rtx. */
17548 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17549 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17550 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17551 }
17552
17553 /* Run all target-specific selftests. */
17554
17555 static void
17556 aarch64_run_selftests (void)
17557 {
17558 aarch64_test_loading_full_dump ();
17559 }
17560
17561 } // namespace selftest
17562
17563 #endif /* #if CHECKING_P */
17564
17565 #undef TARGET_ADDRESS_COST
17566 #define TARGET_ADDRESS_COST aarch64_address_cost
17567
17568 /* This hook will determines whether unnamed bitfields affect the alignment
17569 of the containing structure. The hook returns true if the structure
17570 should inherit the alignment requirements of an unnamed bitfield's
17571 type. */
17572 #undef TARGET_ALIGN_ANON_BITFIELD
17573 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17574
17575 #undef TARGET_ASM_ALIGNED_DI_OP
17576 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17577
17578 #undef TARGET_ASM_ALIGNED_HI_OP
17579 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17580
17581 #undef TARGET_ASM_ALIGNED_SI_OP
17582 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17583
17584 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17585 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17586 hook_bool_const_tree_hwi_hwi_const_tree_true
17587
17588 #undef TARGET_ASM_FILE_START
17589 #define TARGET_ASM_FILE_START aarch64_start_file
17590
17591 #undef TARGET_ASM_OUTPUT_MI_THUNK
17592 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17593
17594 #undef TARGET_ASM_SELECT_RTX_SECTION
17595 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17596
17597 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17598 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17599
17600 #undef TARGET_BUILD_BUILTIN_VA_LIST
17601 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17602
17603 #undef TARGET_CALLEE_COPIES
17604 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17605
17606 #undef TARGET_CAN_ELIMINATE
17607 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17608
17609 #undef TARGET_CAN_INLINE_P
17610 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17611
17612 #undef TARGET_CANNOT_FORCE_CONST_MEM
17613 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17614
17615 #undef TARGET_CASE_VALUES_THRESHOLD
17616 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17617
17618 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17619 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17620
17621 /* Only the least significant bit is used for initialization guard
17622 variables. */
17623 #undef TARGET_CXX_GUARD_MASK_BIT
17624 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17625
17626 #undef TARGET_C_MODE_FOR_SUFFIX
17627 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17628
17629 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17630 #undef TARGET_DEFAULT_TARGET_FLAGS
17631 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17632 #endif
17633
17634 #undef TARGET_CLASS_MAX_NREGS
17635 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17636
17637 #undef TARGET_BUILTIN_DECL
17638 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17639
17640 #undef TARGET_BUILTIN_RECIPROCAL
17641 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17642
17643 #undef TARGET_C_EXCESS_PRECISION
17644 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17645
17646 #undef TARGET_EXPAND_BUILTIN
17647 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17648
17649 #undef TARGET_EXPAND_BUILTIN_VA_START
17650 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17651
17652 #undef TARGET_FOLD_BUILTIN
17653 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17654
17655 #undef TARGET_FUNCTION_ARG
17656 #define TARGET_FUNCTION_ARG aarch64_function_arg
17657
17658 #undef TARGET_FUNCTION_ARG_ADVANCE
17659 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17660
17661 #undef TARGET_FUNCTION_ARG_BOUNDARY
17662 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17663
17664 #undef TARGET_FUNCTION_ARG_PADDING
17665 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17666
17667 #undef TARGET_GET_RAW_RESULT_MODE
17668 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17669 #undef TARGET_GET_RAW_ARG_MODE
17670 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17671
17672 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17673 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17674
17675 #undef TARGET_FUNCTION_VALUE
17676 #define TARGET_FUNCTION_VALUE aarch64_function_value
17677
17678 #undef TARGET_FUNCTION_VALUE_REGNO_P
17679 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17680
17681 #undef TARGET_GIMPLE_FOLD_BUILTIN
17682 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17683
17684 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17685 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17686
17687 #undef TARGET_INIT_BUILTINS
17688 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17689
17690 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17691 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17692 aarch64_ira_change_pseudo_allocno_class
17693
17694 #undef TARGET_LEGITIMATE_ADDRESS_P
17695 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17696
17697 #undef TARGET_LEGITIMATE_CONSTANT_P
17698 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17699
17700 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17701 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17702 aarch64_legitimize_address_displacement
17703
17704 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17705 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17706
17707 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17708 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17709 aarch64_libgcc_floating_mode_supported_p
17710
17711 #undef TARGET_MANGLE_TYPE
17712 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17713
17714 #undef TARGET_MEMORY_MOVE_COST
17715 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17716
17717 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17718 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17719
17720 #undef TARGET_MUST_PASS_IN_STACK
17721 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17722
17723 /* This target hook should return true if accesses to volatile bitfields
17724 should use the narrowest mode possible. It should return false if these
17725 accesses should use the bitfield container type. */
17726 #undef TARGET_NARROW_VOLATILE_BITFIELD
17727 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17728
17729 #undef TARGET_OPTION_OVERRIDE
17730 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17731
17732 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17733 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17734 aarch64_override_options_after_change
17735
17736 #undef TARGET_OPTION_SAVE
17737 #define TARGET_OPTION_SAVE aarch64_option_save
17738
17739 #undef TARGET_OPTION_RESTORE
17740 #define TARGET_OPTION_RESTORE aarch64_option_restore
17741
17742 #undef TARGET_OPTION_PRINT
17743 #define TARGET_OPTION_PRINT aarch64_option_print
17744
17745 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17746 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17747
17748 #undef TARGET_SET_CURRENT_FUNCTION
17749 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17750
17751 #undef TARGET_PASS_BY_REFERENCE
17752 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17753
17754 #undef TARGET_PREFERRED_RELOAD_CLASS
17755 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17756
17757 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17758 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17759
17760 #undef TARGET_PROMOTED_TYPE
17761 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17762
17763 #undef TARGET_SECONDARY_RELOAD
17764 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17765
17766 #undef TARGET_SHIFT_TRUNCATION_MASK
17767 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17768
17769 #undef TARGET_SETUP_INCOMING_VARARGS
17770 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17771
17772 #undef TARGET_STRUCT_VALUE_RTX
17773 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17774
17775 #undef TARGET_REGISTER_MOVE_COST
17776 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17777
17778 #undef TARGET_RETURN_IN_MEMORY
17779 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17780
17781 #undef TARGET_RETURN_IN_MSB
17782 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17783
17784 #undef TARGET_RTX_COSTS
17785 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17786
17787 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17788 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17789
17790 #undef TARGET_SCHED_ISSUE_RATE
17791 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17792
17793 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17794 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17795 aarch64_sched_first_cycle_multipass_dfa_lookahead
17796
17797 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17798 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17799 aarch64_first_cycle_multipass_dfa_lookahead_guard
17800
17801 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17802 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17803 aarch64_get_separate_components
17804
17805 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17806 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17807 aarch64_components_for_bb
17808
17809 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17810 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17811 aarch64_disqualify_components
17812
17813 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17814 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17815 aarch64_emit_prologue_components
17816
17817 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17818 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17819 aarch64_emit_epilogue_components
17820
17821 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17822 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17823 aarch64_set_handled_components
17824
17825 #undef TARGET_TRAMPOLINE_INIT
17826 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17827
17828 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17829 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17830
17831 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17832 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17833
17834 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17835 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17836 aarch64_builtin_support_vector_misalignment
17837
17838 #undef TARGET_ARRAY_MODE
17839 #define TARGET_ARRAY_MODE aarch64_array_mode
17840
17841 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17842 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17843
17844 #undef TARGET_VECTORIZE_ADD_STMT_COST
17845 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17846
17847 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17848 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17849 aarch64_builtin_vectorization_cost
17850
17851 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17852 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17853
17854 #undef TARGET_VECTORIZE_BUILTINS
17855 #define TARGET_VECTORIZE_BUILTINS
17856
17857 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17858 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17859 aarch64_builtin_vectorized_function
17860
17861 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17862 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17863 aarch64_autovectorize_vector_sizes
17864
17865 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17866 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17867 aarch64_atomic_assign_expand_fenv
17868
17869 /* Section anchor support. */
17870
17871 #undef TARGET_MIN_ANCHOR_OFFSET
17872 #define TARGET_MIN_ANCHOR_OFFSET -256
17873
17874 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17875 byte offset; we can do much more for larger data types, but have no way
17876 to determine the size of the access. We assume accesses are aligned. */
17877 #undef TARGET_MAX_ANCHOR_OFFSET
17878 #define TARGET_MAX_ANCHOR_OFFSET 4095
17879
17880 #undef TARGET_VECTOR_ALIGNMENT
17881 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17882
17883 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17884 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17885 aarch64_vectorize_preferred_vector_alignment
17886 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17887 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17888 aarch64_simd_vector_alignment_reachable
17889
17890 /* vec_perm support. */
17891
17892 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17893 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17894 aarch64_vectorize_vec_perm_const
17895
17896 #undef TARGET_VECTORIZE_GET_MASK_MODE
17897 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17898 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17899 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17900 aarch64_empty_mask_is_expensive
17901
17902 #undef TARGET_INIT_LIBFUNCS
17903 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17904
17905 #undef TARGET_FIXED_CONDITION_CODE_REGS
17906 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17907
17908 #undef TARGET_FLAGS_REGNUM
17909 #define TARGET_FLAGS_REGNUM CC_REGNUM
17910
17911 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17912 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17913
17914 #undef TARGET_ASAN_SHADOW_OFFSET
17915 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17916
17917 #undef TARGET_LEGITIMIZE_ADDRESS
17918 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17919
17920 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17921 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17922
17923 #undef TARGET_CAN_USE_DOLOOP_P
17924 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17925
17926 #undef TARGET_SCHED_ADJUST_PRIORITY
17927 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17928
17929 #undef TARGET_SCHED_MACRO_FUSION_P
17930 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17931
17932 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17933 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17934
17935 #undef TARGET_SCHED_FUSION_PRIORITY
17936 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17937
17938 #undef TARGET_UNSPEC_MAY_TRAP_P
17939 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17940
17941 #undef TARGET_USE_PSEUDO_PIC_REG
17942 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17943
17944 #undef TARGET_PRINT_OPERAND
17945 #define TARGET_PRINT_OPERAND aarch64_print_operand
17946
17947 #undef TARGET_PRINT_OPERAND_ADDRESS
17948 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17949
17950 #undef TARGET_OPTAB_SUPPORTED_P
17951 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17952
17953 #undef TARGET_OMIT_STRUCT_RETURN_REG
17954 #define TARGET_OMIT_STRUCT_RETURN_REG true
17955
17956 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17957 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17958 aarch64_dwarf_poly_indeterminate_value
17959
17960 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
17961 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17962 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
17963
17964 #undef TARGET_HARD_REGNO_NREGS
17965 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17966 #undef TARGET_HARD_REGNO_MODE_OK
17967 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17968
17969 #undef TARGET_MODES_TIEABLE_P
17970 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17971
17972 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17973 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17974 aarch64_hard_regno_call_part_clobbered
17975
17976 #undef TARGET_CONSTANT_ALIGNMENT
17977 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17978
17979 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17980 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17981
17982 #undef TARGET_CAN_CHANGE_MODE_CLASS
17983 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17984
17985 #undef TARGET_SELECT_EARLY_REMAT_MODES
17986 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17987
17988 #if CHECKING_P
17989 #undef TARGET_RUN_TARGET_SELFTESTS
17990 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17991 #endif /* #if CHECKING_P */
17992
17993 struct gcc_target targetm = TARGET_INITIALIZER;
17994
17995 #include "gt-aarch64.h"
This page took 0.7957 seconds and 4 git commands to generate.