gcc.git blob: gcc/config/aarch64/aarch64.c
[AArch64] Simplify frame pointer logic
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Classifies an address.
82
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
85
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
88
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
91
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
94
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
97
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
100
101   ADDRESS_SYMBOLIC
102       A constant symbolic address, held in a pc-relative literal pool.  */
103
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
112 };
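/* As a rough, illustrative guide (not exhaustive), the classes above
   correspond to AArch64 assembly addressing forms along these lines:

     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!   or   ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #2]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #2]
     ADDRESS_LO_SUM     add x0, x1, :lo12:sym   /   ldr x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   a PC-relative literal load such as ldr x0, .Lconst  */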
113
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
121 };
122
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
125 {
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
128
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
135
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
138
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
142
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
145
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
148
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
153 };
154
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
161 {}
162
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
173 {}
174
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
181 {}
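/* For example, an Advanced SIMD vector whose HImode elements all equal
   0x2500 could be described as simd_immediate_info (HImode, 0x25, MOV,
   LSL, 8), roughly corresponding to "movi v0.8h, #0x25, lsl #8".
   (The concrete values here are purely illustrative.)  */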
182
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
185
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
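/* For a 128-bit SVE implementation this is 2, for 256-bit it is 4, and so
   on; it is a poly_uint16 because the vector length may not be known at
   compile time.  */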
188
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
193
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
210
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
213
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
216
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
219
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
222
223 /* Global flag for whether frame pointer is enabled. */
224 bool aarch64_use_frame_pointer;
225
226 /* Support for command line parsing of boolean flags in the tuning
227 structures. */
228 struct aarch64_flag_desc
229 {
230 const char* name;
231 unsigned int flag;
232 };
233
234 #define AARCH64_FUSION_PAIR(name, internal_name) \
235 { name, AARCH64_FUSE_##internal_name },
236 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
237 {
238 { "none", AARCH64_FUSE_NOTHING },
239 #include "aarch64-fusion-pairs.def"
240 { "all", AARCH64_FUSE_ALL },
241 { NULL, AARCH64_FUSE_NOTHING }
242 };
243
244 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
245 { name, AARCH64_EXTRA_TUNE_##internal_name },
246 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
247 {
248 { "none", AARCH64_EXTRA_TUNE_NONE },
249 #include "aarch64-tuning-flags.def"
250 { "all", AARCH64_EXTRA_TUNE_ALL },
251 { NULL, AARCH64_EXTRA_TUNE_NONE }
252 };
253
254 /* Tuning parameters. */
255
256 static const struct cpu_addrcost_table generic_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 0, /* pre_modify */
265 0, /* post_modify */
266 0, /* register_offset */
267 0, /* register_sextend */
268 0, /* register_zextend */
269 0 /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table exynosm1_addrcost_table =
273 {
274 {
275 0, /* hi */
276 0, /* si */
277 0, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 1, /* register_offset */
283 1, /* register_sextend */
284 2, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table xgene1_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 1, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 0, /* pre_modify */
313 0, /* post_modify */
314 2, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 0, /* imm_offset */
318 };
319
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322 1, /* GP2GP */
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
325 5, /* GP2FP */
326 5, /* FP2GP */
327 2 /* FP2FP */
328 };
329
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332 1, /* GP2GP */
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
335 5, /* GP2FP */
336 5, /* FP2GP */
337 2 /* FP2FP */
338 };
339
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342 1, /* GP2GP */
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
345 5, /* GP2FP */
346 5, /* FP2GP */
347 2 /* FP2FP */
348 };
349
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352 1, /* GP2GP */
353 /* Avoid the use of slow int<->fp moves for spilling by setting
354      their cost higher than memmov_cost (actual costs: 4 and 9).  */
355 9, /* GP2FP */
356 9, /* FP2GP */
357 1 /* FP2FP */
358 };
359
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362 2, /* GP2GP */
363 2, /* GP2FP */
364 6, /* FP2GP */
365 4 /* FP2FP */
366 };
367
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370 1, /* GP2GP */
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
373 8, /* GP2FP */
374 8, /* FP2GP */
375 2 /* FP2FP */
376 };
377
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380 2, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 6, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of int<->fp moves for spilling. */
391 8, /* GP2FP */
392 8, /* FP2GP */
393 4 /* FP2FP */
394 };
395
396 /* Generic costs for vector insn classes. */
397 static const struct cpu_vector_cost generic_vector_cost =
398 {
399 1, /* scalar_int_stmt_cost */
400 1, /* scalar_fp_stmt_cost */
401 1, /* scalar_load_cost */
402 1, /* scalar_store_cost */
403 1, /* vec_int_stmt_cost */
404 1, /* vec_fp_stmt_cost */
405 2, /* vec_permute_cost */
406 1, /* vec_to_scalar_cost */
407 1, /* scalar_to_vec_cost */
408 1, /* vec_align_load_cost */
409 1, /* vec_unalign_load_cost */
410 1, /* vec_unalign_store_cost */
411 1, /* vec_store_cost */
412 3, /* cond_taken_branch_cost */
413 1 /* cond_not_taken_branch_cost */
414 };
415
416 /* ThunderX costs for vector insn classes. */
417 static const struct cpu_vector_cost thunderx_vector_cost =
418 {
419 1, /* scalar_int_stmt_cost */
420 1, /* scalar_fp_stmt_cost */
421 3, /* scalar_load_cost */
422 1, /* scalar_store_cost */
423 4, /* vec_int_stmt_cost */
424 1, /* vec_fp_stmt_cost */
425 4, /* vec_permute_cost */
426 2, /* vec_to_scalar_cost */
427 2, /* scalar_to_vec_cost */
428 3, /* vec_align_load_cost */
429 5, /* vec_unalign_load_cost */
430 5, /* vec_unalign_store_cost */
431 1, /* vec_store_cost */
432 3, /* cond_taken_branch_cost */
433 3 /* cond_not_taken_branch_cost */
434 };
435
436 /* Generic costs for vector insn classes. */
437 static const struct cpu_vector_cost cortexa57_vector_cost =
438 {
439 1, /* scalar_int_stmt_cost */
440 1, /* scalar_fp_stmt_cost */
441 4, /* scalar_load_cost */
442 1, /* scalar_store_cost */
443 2, /* vec_int_stmt_cost */
444 2, /* vec_fp_stmt_cost */
445 3, /* vec_permute_cost */
446 8, /* vec_to_scalar_cost */
447 8, /* scalar_to_vec_cost */
448 4, /* vec_align_load_cost */
449 4, /* vec_unalign_load_cost */
450 1, /* vec_unalign_store_cost */
451 1, /* vec_store_cost */
452 1, /* cond_taken_branch_cost */
453 1 /* cond_not_taken_branch_cost */
454 };
455
456 static const struct cpu_vector_cost exynosm1_vector_cost =
457 {
458 1, /* scalar_int_stmt_cost */
459 1, /* scalar_fp_stmt_cost */
460 5, /* scalar_load_cost */
461 1, /* scalar_store_cost */
462 3, /* vec_int_stmt_cost */
463 3, /* vec_fp_stmt_cost */
464 3, /* vec_permute_cost */
465 3, /* vec_to_scalar_cost */
466 3, /* scalar_to_vec_cost */
467 5, /* vec_align_load_cost */
468 5, /* vec_unalign_load_cost */
469 1, /* vec_unalign_store_cost */
470 1, /* vec_store_cost */
471 1, /* cond_taken_branch_cost */
472 1 /* cond_not_taken_branch_cost */
473 };
474
475 /* Generic costs for vector insn classes. */
476 static const struct cpu_vector_cost xgene1_vector_cost =
477 {
478 1, /* scalar_int_stmt_cost */
479 1, /* scalar_fp_stmt_cost */
480 5, /* scalar_load_cost */
481 1, /* scalar_store_cost */
482 2, /* vec_int_stmt_cost */
483 2, /* vec_fp_stmt_cost */
484 2, /* vec_permute_cost */
485 4, /* vec_to_scalar_cost */
486 4, /* scalar_to_vec_cost */
487 10, /* vec_align_load_cost */
488 10, /* vec_unalign_load_cost */
489 2, /* vec_unalign_store_cost */
490 2, /* vec_store_cost */
491 2, /* cond_taken_branch_cost */
492 1 /* cond_not_taken_branch_cost */
493 };
494
495 /* Costs for vector insn classes for Vulcan. */
496 static const struct cpu_vector_cost thunderx2t99_vector_cost =
497 {
498 1, /* scalar_int_stmt_cost */
499 6, /* scalar_fp_stmt_cost */
500 4, /* scalar_load_cost */
501 1, /* scalar_store_cost */
502 5, /* vec_int_stmt_cost */
503 6, /* vec_fp_stmt_cost */
504 3, /* vec_permute_cost */
505 6, /* vec_to_scalar_cost */
506 5, /* scalar_to_vec_cost */
507 8, /* vec_align_load_cost */
508 8, /* vec_unalign_load_cost */
509 4, /* vec_unalign_store_cost */
510 4, /* vec_store_cost */
511 2, /* cond_taken_branch_cost */
512 1 /* cond_not_taken_branch_cost */
513 };
514
515 /* Generic costs for branch instructions. */
516 static const struct cpu_branch_cost generic_branch_cost =
517 {
518 1, /* Predictable. */
519 3 /* Unpredictable. */
520 };
521
522 /* Generic approximation modes. */
523 static const cpu_approx_modes generic_approx_modes =
524 {
525 AARCH64_APPROX_NONE, /* division */
526 AARCH64_APPROX_NONE, /* sqrt */
527 AARCH64_APPROX_NONE /* recip_sqrt */
528 };
529
530 /* Approximation modes for Exynos M1. */
531 static const cpu_approx_modes exynosm1_approx_modes =
532 {
533 AARCH64_APPROX_NONE, /* division */
534 AARCH64_APPROX_ALL, /* sqrt */
535 AARCH64_APPROX_ALL /* recip_sqrt */
536 };
537
538 /* Approximation modes for X-Gene 1. */
539 static const cpu_approx_modes xgene1_approx_modes =
540 {
541 AARCH64_APPROX_NONE, /* division */
542 AARCH64_APPROX_NONE, /* sqrt */
543 AARCH64_APPROX_ALL /* recip_sqrt */
544 };
545
546 /* Generic prefetch settings (which disable prefetch). */
547 static const cpu_prefetch_tune generic_prefetch_tune =
548 {
549 0, /* num_slots */
550 -1, /* l1_cache_size */
551 -1, /* l1_cache_line_size */
552 -1, /* l2_cache_size */
553 -1 /* default_opt_level */
554 };
555
556 static const cpu_prefetch_tune exynosm1_prefetch_tune =
557 {
558 0, /* num_slots */
559 -1, /* l1_cache_size */
560 64, /* l1_cache_line_size */
561 -1, /* l2_cache_size */
562 -1 /* default_opt_level */
563 };
564
565 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
566 {
567 4, /* num_slots */
568 32, /* l1_cache_size */
569 64, /* l1_cache_line_size */
570 512, /* l2_cache_size */
571 -1 /* default_opt_level */
572 };
573
574 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
575 {
576 8, /* num_slots */
577 32, /* l1_cache_size */
578 128, /* l1_cache_line_size */
579 16*1024, /* l2_cache_size */
580 3 /* default_opt_level */
581 };
582
583 static const cpu_prefetch_tune thunderx_prefetch_tune =
584 {
585 8, /* num_slots */
586 32, /* l1_cache_size */
587 128, /* l1_cache_line_size */
588 -1, /* l2_cache_size */
589 -1 /* default_opt_level */
590 };
591
592 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
593 {
594 8, /* num_slots */
595 32, /* l1_cache_size */
596 64, /* l1_cache_line_size */
597 256, /* l2_cache_size */
598 -1 /* default_opt_level */
599 };
600
601 static const struct tune_params generic_tunings =
602 {
603 &cortexa57_extra_costs,
604 &generic_addrcost_table,
605 &generic_regmove_cost,
606 &generic_vector_cost,
607 &generic_branch_cost,
608 &generic_approx_modes,
609 4, /* memmov_cost */
610 2, /* issue_rate */
611 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
612 8, /* function_align. */
613 4, /* jump_align. */
614 8, /* loop_align. */
615 2, /* int_reassoc_width. */
616 4, /* fp_reassoc_width. */
617 1, /* vec_reassoc_width. */
618 2, /* min_div_recip_mul_sf. */
619 2, /* min_div_recip_mul_df. */
620 0, /* max_case_values. */
621 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
622 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
623 &generic_prefetch_tune
624 };
625
626 static const struct tune_params cortexa35_tunings =
627 {
628 &cortexa53_extra_costs,
629 &generic_addrcost_table,
630 &cortexa53_regmove_cost,
631 &generic_vector_cost,
632 &generic_branch_cost,
633 &generic_approx_modes,
634 4, /* memmov_cost */
635 1, /* issue_rate */
636 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
637 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
638 16, /* function_align. */
639 4, /* jump_align. */
640 8, /* loop_align. */
641 2, /* int_reassoc_width. */
642 4, /* fp_reassoc_width. */
643 1, /* vec_reassoc_width. */
644 2, /* min_div_recip_mul_sf. */
645 2, /* min_div_recip_mul_df. */
646 0, /* max_case_values. */
647 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
648 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
649 &generic_prefetch_tune
650 };
651
652 static const struct tune_params cortexa53_tunings =
653 {
654 &cortexa53_extra_costs,
655 &generic_addrcost_table,
656 &cortexa53_regmove_cost,
657 &generic_vector_cost,
658 &generic_branch_cost,
659 &generic_approx_modes,
660 4, /* memmov_cost */
661 2, /* issue_rate */
662 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
663 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
664 16, /* function_align. */
665 4, /* jump_align. */
666 8, /* loop_align. */
667 2, /* int_reassoc_width. */
668 4, /* fp_reassoc_width. */
669 1, /* vec_reassoc_width. */
670 2, /* min_div_recip_mul_sf. */
671 2, /* min_div_recip_mul_df. */
672 0, /* max_case_values. */
673 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
674 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
675 &generic_prefetch_tune
676 };
677
678 static const struct tune_params cortexa57_tunings =
679 {
680 &cortexa57_extra_costs,
681 &generic_addrcost_table,
682 &cortexa57_regmove_cost,
683 &cortexa57_vector_cost,
684 &generic_branch_cost,
685 &generic_approx_modes,
686 4, /* memmov_cost */
687 3, /* issue_rate */
688 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
689 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
690 16, /* function_align. */
691 4, /* jump_align. */
692 8, /* loop_align. */
693 2, /* int_reassoc_width. */
694 4, /* fp_reassoc_width. */
695 1, /* vec_reassoc_width. */
696 2, /* min_div_recip_mul_sf. */
697 2, /* min_div_recip_mul_df. */
698 0, /* max_case_values. */
699 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
700 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
701 &generic_prefetch_tune
702 };
703
704 static const struct tune_params cortexa72_tunings =
705 {
706 &cortexa57_extra_costs,
707 &generic_addrcost_table,
708 &cortexa57_regmove_cost,
709 &cortexa57_vector_cost,
710 &generic_branch_cost,
711 &generic_approx_modes,
712 4, /* memmov_cost */
713 3, /* issue_rate */
714 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
715 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
716 16, /* function_align. */
717 4, /* jump_align. */
718 8, /* loop_align. */
719 2, /* int_reassoc_width. */
720 4, /* fp_reassoc_width. */
721 1, /* vec_reassoc_width. */
722 2, /* min_div_recip_mul_sf. */
723 2, /* min_div_recip_mul_df. */
724 0, /* max_case_values. */
725 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
726 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
727 &generic_prefetch_tune
728 };
729
730 static const struct tune_params cortexa73_tunings =
731 {
732 &cortexa57_extra_costs,
733 &generic_addrcost_table,
734 &cortexa57_regmove_cost,
735 &cortexa57_vector_cost,
736 &generic_branch_cost,
737 &generic_approx_modes,
738 4, /* memmov_cost. */
739 2, /* issue_rate. */
740 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
741 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
742 16, /* function_align. */
743 4, /* jump_align. */
744 8, /* loop_align. */
745 2, /* int_reassoc_width. */
746 4, /* fp_reassoc_width. */
747 1, /* vec_reassoc_width. */
748 2, /* min_div_recip_mul_sf. */
749 2, /* min_div_recip_mul_df. */
750 0, /* max_case_values. */
751 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
752 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
753 &generic_prefetch_tune
754 };
755
756
757
758 static const struct tune_params exynosm1_tunings =
759 {
760 &exynosm1_extra_costs,
761 &exynosm1_addrcost_table,
762 &exynosm1_regmove_cost,
763 &exynosm1_vector_cost,
764 &generic_branch_cost,
765 &exynosm1_approx_modes,
766 4, /* memmov_cost */
767 3, /* issue_rate */
768 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
769 4, /* function_align. */
770 4, /* jump_align. */
771 4, /* loop_align. */
772 2, /* int_reassoc_width. */
773 4, /* fp_reassoc_width. */
774 1, /* vec_reassoc_width. */
775 2, /* min_div_recip_mul_sf. */
776 2, /* min_div_recip_mul_df. */
777 48, /* max_case_values. */
778 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
779 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
780 &exynosm1_prefetch_tune
781 };
782
783 static const struct tune_params thunderxt88_tunings =
784 {
785 &thunderx_extra_costs,
786 &generic_addrcost_table,
787 &thunderx_regmove_cost,
788 &thunderx_vector_cost,
789 &generic_branch_cost,
790 &generic_approx_modes,
791 6, /* memmov_cost */
792 2, /* issue_rate */
793 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
794 8, /* function_align. */
795 8, /* jump_align. */
796 8, /* loop_align. */
797 2, /* int_reassoc_width. */
798 4, /* fp_reassoc_width. */
799 1, /* vec_reassoc_width. */
800 2, /* min_div_recip_mul_sf. */
801 2, /* min_div_recip_mul_df. */
802 0, /* max_case_values. */
803 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
804 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
805 &thunderxt88_prefetch_tune
806 };
807
808 static const struct tune_params thunderx_tunings =
809 {
810 &thunderx_extra_costs,
811 &generic_addrcost_table,
812 &thunderx_regmove_cost,
813 &thunderx_vector_cost,
814 &generic_branch_cost,
815 &generic_approx_modes,
816 6, /* memmov_cost */
817 2, /* issue_rate */
818 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
819 8, /* function_align. */
820 8, /* jump_align. */
821 8, /* loop_align. */
822 2, /* int_reassoc_width. */
823 4, /* fp_reassoc_width. */
824 1, /* vec_reassoc_width. */
825 2, /* min_div_recip_mul_sf. */
826 2, /* min_div_recip_mul_df. */
827 0, /* max_case_values. */
828 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
829 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
830 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
831 &thunderx_prefetch_tune
832 };
833
834 static const struct tune_params xgene1_tunings =
835 {
836 &xgene1_extra_costs,
837 &xgene1_addrcost_table,
838 &xgene1_regmove_cost,
839 &xgene1_vector_cost,
840 &generic_branch_cost,
841 &xgene1_approx_modes,
842 6, /* memmov_cost */
843 4, /* issue_rate */
844 AARCH64_FUSE_NOTHING, /* fusible_ops */
845 16, /* function_align. */
846 8, /* jump_align. */
847 16, /* loop_align. */
848 2, /* int_reassoc_width. */
849 4, /* fp_reassoc_width. */
850 1, /* vec_reassoc_width. */
851 2, /* min_div_recip_mul_sf. */
852 2, /* min_div_recip_mul_df. */
853 0, /* max_case_values. */
854 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
855 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
856 &generic_prefetch_tune
857 };
858
859 static const struct tune_params qdf24xx_tunings =
860 {
861 &qdf24xx_extra_costs,
862 &generic_addrcost_table,
863 &qdf24xx_regmove_cost,
864 &generic_vector_cost,
865 &generic_branch_cost,
866 &generic_approx_modes,
867 4, /* memmov_cost */
868 4, /* issue_rate */
869 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
870    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
871 16, /* function_align. */
872 8, /* jump_align. */
873 16, /* loop_align. */
874 2, /* int_reassoc_width. */
875 4, /* fp_reassoc_width. */
876 1, /* vec_reassoc_width. */
877 2, /* min_div_recip_mul_sf. */
878 2, /* min_div_recip_mul_df. */
879 0, /* max_case_values. */
880 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
881 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
882 &qdf24xx_prefetch_tune
883 };
884
885 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
886 for now. */
887 static const struct tune_params saphira_tunings =
888 {
889 &generic_extra_costs,
890 &generic_addrcost_table,
891 &generic_regmove_cost,
892 &generic_vector_cost,
893 &generic_branch_cost,
894 &generic_approx_modes,
895 4, /* memmov_cost */
896 4, /* issue_rate */
897 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
898    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
899 16, /* function_align. */
900 8, /* jump_align. */
901 16, /* loop_align. */
902 2, /* int_reassoc_width. */
903 4, /* fp_reassoc_width. */
904 1, /* vec_reassoc_width. */
905 2, /* min_div_recip_mul_sf. */
906 2, /* min_div_recip_mul_df. */
907 0, /* max_case_values. */
908 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
909 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
910 &generic_prefetch_tune
911 };
912
913 static const struct tune_params thunderx2t99_tunings =
914 {
915 &thunderx2t99_extra_costs,
916 &thunderx2t99_addrcost_table,
917 &thunderx2t99_regmove_cost,
918 &thunderx2t99_vector_cost,
919 &generic_branch_cost,
920 &generic_approx_modes,
921 4, /* memmov_cost. */
922 4, /* issue_rate. */
923 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
924 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
925 16, /* function_align. */
926 8, /* jump_align. */
927 16, /* loop_align. */
928 3, /* int_reassoc_width. */
929 2, /* fp_reassoc_width. */
930 2, /* vec_reassoc_width. */
931 2, /* min_div_recip_mul_sf. */
932 2, /* min_div_recip_mul_df. */
933 0, /* max_case_values. */
934 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
935 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
936 &thunderx2t99_prefetch_tune
937 };
938
939 /* Support for fine-grained override of the tuning structures. */
940 struct aarch64_tuning_override_function
941 {
942 const char* name;
943 void (*parse_override)(const char*, struct tune_params*);
944 };
945
946 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
947 static void aarch64_parse_tune_string (const char*, struct tune_params*);
948
949 static const struct aarch64_tuning_override_function
950 aarch64_tuning_override_functions[] =
951 {
952 { "fuse", aarch64_parse_fuse_string },
953 { "tune", aarch64_parse_tune_string },
954 { NULL, NULL }
955 };
956
957 /* A processor implementing AArch64. */
958 struct processor
959 {
960 const char *const name;
961 enum aarch64_processor ident;
962 enum aarch64_processor sched_core;
963 enum aarch64_arch arch;
964 unsigned architecture_version;
965 const unsigned long flags;
966 const struct tune_params *const tune;
967 };
968
969 /* Architectures implementing AArch64. */
970 static const struct processor all_architectures[] =
971 {
972 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
973 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
974 #include "aarch64-arches.def"
975 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
976 };
977
978 /* Processor cores implementing AArch64. */
979 static const struct processor all_cores[] =
980 {
981 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
982 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
983 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
984 FLAGS, &COSTS##_tunings},
985 #include "aarch64-cores.def"
986 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
987 AARCH64_FL_FOR_ARCH8, &generic_tunings},
988 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
989 };
990
991
992 /* Target specification. These are populated by the -march, -mtune, -mcpu
993 handling code or by target attributes. */
994 static const struct processor *selected_arch;
995 static const struct processor *selected_cpu;
996 static const struct processor *selected_tune;
997
998 /* The current tuning set. */
999 struct tune_params aarch64_tune_params = generic_tunings;
1000
1001 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1002
1003 /* An ISA extension in the co-processor and main instruction set space. */
1004 struct aarch64_option_extension
1005 {
1006 const char *const name;
1007 const unsigned long flags_on;
1008 const unsigned long flags_off;
1009 };
1010
1011 typedef enum aarch64_cond_code
1012 {
1013 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1014 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1015 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1016 }
1017 aarch64_cc;
1018
1019 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
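/* For example, AARCH64_EQ (0) and AARCH64_NE (1) map to each other, as do
   AARCH64_GE (10) and AARCH64_LT (11); flipping the low bit inverts the
   condition because the codes above are laid out in complementary pairs.  */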
1020
1021 /* The condition codes of the processor, and the inverse function. */
1022 static const char * const aarch64_condition_codes[] =
1023 {
1024 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1025 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1026 };
1027
1028 /* Generate code to enable conditional branches in functions over 1 MiB. */
1029 const char *
1030 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1031 const char * branch_format)
1032 {
1033 rtx_code_label * tmp_label = gen_label_rtx ();
1034 char label_buf[256];
1035 char buffer[128];
1036 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1037 CODE_LABEL_NUMBER (tmp_label));
1038 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1039 rtx dest_label = operands[pos_label];
1040 operands[pos_label] = tmp_label;
1041
1042 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1043 output_asm_insn (buffer, operands);
1044
1045 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1046 operands[pos_label] = dest_label;
1047 output_asm_insn (buffer, operands);
1048 return "";
1049 }
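/* For example, given an inverted BRANCH_FORMAT from the caller, a far-away
   "cbz x0, .Ldest" is typically expanded along these lines:

       cbnz x0, .Lskip        (short-range inverted branch)
       b .Ldest               (unconditional branch, +/-128 MiB range)
     .Lskip:

   The label names here are purely illustrative.  */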
1050
1051 void
1052 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1053 {
1054 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1055 if (TARGET_GENERAL_REGS_ONLY)
1056 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1057 else
1058 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1059 }
1060
1061 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1062 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1063 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1064 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1065 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1066    irrespective of its cost results in bad allocations with many redundant
1067 int<->FP moves which are expensive on various cores.
1068 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1069 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1070 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1071 Otherwise set the allocno class depending on the mode.
1072 The result of this is that it is no longer inefficient to have a higher
1073 memory move cost than the register move cost.
1074 */
1075
1076 static reg_class_t
1077 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1078 reg_class_t best_class)
1079 {
1080 machine_mode mode;
1081
1082 if (allocno_class != ALL_REGS)
1083 return allocno_class;
1084
1085 if (best_class != ALL_REGS)
1086 return best_class;
1087
1088 mode = PSEUDO_REGNO_MODE (regno);
1089 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1090 }
1091
1092 static unsigned int
1093 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1094 {
1095 if (GET_MODE_UNIT_SIZE (mode) == 4)
1096 return aarch64_tune_params.min_div_recip_mul_sf;
1097 return aarch64_tune_params.min_div_recip_mul_df;
1098 }
1099
1100 /* Return the reassociation width of treeop OPC with mode MODE. */
1101 static int
1102 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1103 {
1104 if (VECTOR_MODE_P (mode))
1105 return aarch64_tune_params.vec_reassoc_width;
1106 if (INTEGRAL_MODE_P (mode))
1107 return aarch64_tune_params.int_reassoc_width;
1108 /* Avoid reassociating floating point addition so we emit more FMAs. */
1109 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1110 return aarch64_tune_params.fp_reassoc_width;
1111 return 1;
1112 }
1113
1114 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1115 unsigned
1116 aarch64_dbx_register_number (unsigned regno)
1117 {
1118 if (GP_REGNUM_P (regno))
1119 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1120 else if (regno == SP_REGNUM)
1121 return AARCH64_DWARF_SP;
1122 else if (FP_REGNUM_P (regno))
1123 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1124 else if (PR_REGNUM_P (regno))
1125 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1126 else if (regno == VG_REGNUM)
1127 return AARCH64_DWARF_VG;
1128
1129 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1130 equivalent DWARF register. */
1131 return DWARF_FRAME_REGISTERS;
1132 }
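/* As a rough illustration of the mapping above: x0 maps to DWARF register 0,
   sp to 31, v0 to 64 and p0 to 48, with VG using its own dedicated DWARF
   number (see the AARCH64_DWARF_* definitions for the exact values).  */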
1133
1134 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1135 static bool
1136 aarch64_advsimd_struct_mode_p (machine_mode mode)
1137 {
1138 return (TARGET_SIMD
1139 && (mode == OImode || mode == CImode || mode == XImode));
1140 }
1141
1142 /* Return true if MODE is an SVE predicate mode. */
1143 static bool
1144 aarch64_sve_pred_mode_p (machine_mode mode)
1145 {
1146 return (TARGET_SVE
1147 && (mode == VNx16BImode
1148 || mode == VNx8BImode
1149 || mode == VNx4BImode
1150 || mode == VNx2BImode));
1151 }
1152
1153 /* Three mutually-exclusive flags describing a vector or predicate type. */
1154 const unsigned int VEC_ADVSIMD = 1;
1155 const unsigned int VEC_SVE_DATA = 2;
1156 const unsigned int VEC_SVE_PRED = 4;
1157 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1158 a structure of 2, 3 or 4 vectors. */
1159 const unsigned int VEC_STRUCT = 8;
1160 /* Useful combinations of the above. */
1161 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1162 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1163
1164 /* Return a set of flags describing the vector properties of mode MODE.
1165 Ignore modes that are not supported by the current target. */
1166 static unsigned int
1167 aarch64_classify_vector_mode (machine_mode mode)
1168 {
1169 if (aarch64_advsimd_struct_mode_p (mode))
1170 return VEC_ADVSIMD | VEC_STRUCT;
1171
1172 if (aarch64_sve_pred_mode_p (mode))
1173 return VEC_SVE_PRED;
1174
1175 scalar_mode inner = GET_MODE_INNER (mode);
1176 if (VECTOR_MODE_P (mode)
1177 && (inner == QImode
1178 || inner == HImode
1179 || inner == HFmode
1180 || inner == SImode
1181 || inner == SFmode
1182 || inner == DImode
1183 || inner == DFmode))
1184 {
1185 if (TARGET_SVE)
1186 {
1187 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1188 return VEC_SVE_DATA;
1189 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1190 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1191 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1192 return VEC_SVE_DATA | VEC_STRUCT;
1193 }
1194
1195 /* This includes V1DF but not V1DI (which doesn't exist). */
1196 if (TARGET_SIMD
1197 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1198 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1199 return VEC_ADVSIMD;
1200 }
1201
1202 return 0;
1203 }
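/* For instance, with TARGET_SIMD, V4SImode classifies as VEC_ADVSIMD and
   OImode (a pair of 128-bit vectors) as VEC_ADVSIMD | VEC_STRUCT, while
   with TARGET_SVE, VNx4SImode classifies as VEC_SVE_DATA and VNx4BImode
   as VEC_SVE_PRED.  */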
1204
1205 /* Return true if MODE is any of the data vector modes, including
1206 structure modes. */
1207 static bool
1208 aarch64_vector_data_mode_p (machine_mode mode)
1209 {
1210 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1211 }
1212
1213 /* Return true if MODE is an SVE data vector mode; either a single vector
1214 or a structure of vectors. */
1215 static bool
1216 aarch64_sve_data_mode_p (machine_mode mode)
1217 {
1218 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1219 }
1220
1221 /* Implement target hook TARGET_ARRAY_MODE. */
1222 static opt_machine_mode
1223 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1224 {
1225 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1226 && IN_RANGE (nelems, 2, 4))
1227 return mode_for_vector (GET_MODE_INNER (mode),
1228 GET_MODE_NUNITS (mode) * nelems);
1229
1230 return opt_machine_mode ();
1231 }
1232
1233 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1234 static bool
1235 aarch64_array_mode_supported_p (machine_mode mode,
1236 unsigned HOST_WIDE_INT nelems)
1237 {
1238 if (TARGET_SIMD
1239 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1240 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1241 && (nelems >= 2 && nelems <= 4))
1242 return true;
1243
1244 return false;
1245 }
1246
1247 /* Return the SVE predicate mode to use for elements that have
1248 ELEM_NBYTES bytes, if such a mode exists. */
1249
1250 opt_machine_mode
1251 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1252 {
1253 if (TARGET_SVE)
1254 {
1255 if (elem_nbytes == 1)
1256 return VNx16BImode;
1257 if (elem_nbytes == 2)
1258 return VNx8BImode;
1259 if (elem_nbytes == 4)
1260 return VNx4BImode;
1261 if (elem_nbytes == 8)
1262 return VNx2BImode;
1263 }
1264 return opt_machine_mode ();
1265 }
1266
1267 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1268
1269 static opt_machine_mode
1270 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1271 {
1272 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1273 {
1274 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1275 machine_mode pred_mode;
1276 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1277 return pred_mode;
1278 }
1279
1280 return default_get_mask_mode (nunits, nbytes);
1281 }
1282
1283 /* Implement TARGET_HARD_REGNO_NREGS. */
1284
1285 static unsigned int
1286 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1287 {
1288 /* ??? Logically we should only need to provide a value when
1289 HARD_REGNO_MODE_OK says that the combination is valid,
1290 but at the moment we need to handle all modes. Just ignore
1291 any runtime parts for registers that can't store them. */
1292 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1293 switch (aarch64_regno_regclass (regno))
1294 {
1295 case FP_REGS:
1296 case FP_LO_REGS:
1297 if (aarch64_sve_data_mode_p (mode))
1298 return exact_div (GET_MODE_SIZE (mode),
1299 BYTES_PER_SVE_VECTOR).to_constant ();
1300 return CEIL (lowest_size, UNITS_PER_VREG);
1301 case PR_REGS:
1302 case PR_LO_REGS:
1303 case PR_HI_REGS:
1304 return 1;
1305 default:
1306 return CEIL (lowest_size, UNITS_PER_WORD);
1307 }
1308 gcc_unreachable ();
1309 }
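/* For example, a TImode (16-byte) value occupies two GP registers
   (CEIL (16, UNITS_PER_WORD)) but only one FP/SIMD register
   (CEIL (16, UNITS_PER_VREG)), and any predicate mode occupies a single
   predicate register.  */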
1310
1311 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1312
1313 static bool
1314 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1315 {
1316 if (GET_MODE_CLASS (mode) == MODE_CC)
1317 return regno == CC_REGNUM;
1318
1319 if (regno == VG_REGNUM)
1320 /* This must have the same size as _Unwind_Word. */
1321 return mode == DImode;
1322
1323 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1324 if (vec_flags & VEC_SVE_PRED)
1325 return PR_REGNUM_P (regno);
1326
1327 if (PR_REGNUM_P (regno))
1328 return 0;
1329
1330 if (regno == SP_REGNUM)
1331 /* The purpose of comparing with ptr_mode is to support the
1332 global register variable associated with the stack pointer
1333 register via the syntax of asm ("wsp") in ILP32. */
1334 return mode == Pmode || mode == ptr_mode;
1335
1336 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1337 return mode == Pmode;
1338
1339 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1340 return true;
1341
1342 if (FP_REGNUM_P (regno))
1343 {
1344 if (vec_flags & VEC_STRUCT)
1345 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1346 else
1347 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1348 }
1349
1350 return false;
1351 }
1352
1353 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1354 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1355 clobbers the top 64 bits when restoring the bottom 64 bits. */
1356
1357 static bool
1358 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1359 {
1360 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1361 }
1362
1363 /* Implement REGMODE_NATURAL_SIZE. */
1364 poly_uint64
1365 aarch64_regmode_natural_size (machine_mode mode)
1366 {
1367 /* The natural size for SVE data modes is one SVE data vector,
1368 and similarly for predicates. We can't independently modify
1369 anything smaller than that. */
1370 /* ??? For now, only do this for variable-width SVE registers.
1371 Doing it for constant-sized registers breaks lower-subreg.c. */
1372 /* ??? And once that's fixed, we should probably have similar
1373 code for Advanced SIMD. */
1374 if (!aarch64_sve_vg.is_constant ())
1375 {
1376 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1377 if (vec_flags & VEC_SVE_PRED)
1378 return BYTES_PER_SVE_PRED;
1379 if (vec_flags & VEC_SVE_DATA)
1380 return BYTES_PER_SVE_VECTOR;
1381 }
1382 return UNITS_PER_WORD;
1383 }
1384
1385 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1386 machine_mode
1387 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1388 machine_mode mode)
1389 {
1390 /* The predicate mode determines which bits are significant and
1391 which are "don't care". Decreasing the number of lanes would
1392 lose data while increasing the number of lanes would make bits
1393 unnecessarily significant. */
1394 if (PR_REGNUM_P (regno))
1395 return mode;
1396 if (known_ge (GET_MODE_SIZE (mode), 4))
1397 return mode;
1398 else
1399 return SImode;
1400 }
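/* For example, a HFmode (2-byte) value is widened to SImode for caller
   save/restore, anything 4 bytes or wider keeps its own mode, and predicate
   registers always keep their predicate mode.  */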
1401
1402 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1403 that strcpy from constants will be faster. */
1404
1405 static HOST_WIDE_INT
1406 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1407 {
1408 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1409 return MAX (align, BITS_PER_WORD);
1410 return align;
1411 }
1412
1413 /* Return true if calls to DECL should be treated as
1414    long-calls (i.e. called via a register).  */
1415 static bool
1416 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1417 {
1418 return false;
1419 }
1420
1421 /* Return true if calls to symbol-ref SYM should be treated as
1422    long-calls (i.e. called via a register).  */
1423 bool
1424 aarch64_is_long_call_p (rtx sym)
1425 {
1426 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1427 }
1428
1429 /* Return true if calls to symbol-ref SYM should not go through
1430 plt stubs. */
1431
1432 bool
1433 aarch64_is_noplt_call_p (rtx sym)
1434 {
1435 const_tree decl = SYMBOL_REF_DECL (sym);
1436
1437 if (flag_pic
1438 && decl
1439 && (!flag_plt
1440 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1441 && !targetm.binds_local_p (decl))
1442 return true;
1443
1444 return false;
1445 }
1446
1447 /* Return true if the offsets to a zero/sign-extract operation
1448 represent an expression that matches an extend operation. The
1449    operands represent the parameters from
1450
1451 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1452 bool
1453 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1454 rtx extract_imm)
1455 {
1456 HOST_WIDE_INT mult_val, extract_val;
1457
1458 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1459 return false;
1460
1461 mult_val = INTVAL (mult_imm);
1462 extract_val = INTVAL (extract_imm);
1463
1464 if (extract_val > 8
1465 && extract_val < GET_MODE_BITSIZE (mode)
1466 && exact_log2 (extract_val & ~7) > 0
1467 && (extract_val & 7) <= 4
1468 && mult_val == (1 << (extract_val & 7)))
1469 return true;
1470
1471 return false;
1472 }
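/* A worked example (illustrative only): in DImode, EXTRACT_IMM == 34 and
   MULT_IMM == 4 satisfy the checks above (34 & ~7 == 32 is a power of two,
   34 & 7 == 2 <= 4, and 4 == 1 << 2), i.e. the extract describes a 32-bit
   value shifted left by 2 and then extended.  */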
1473
1474 /* Emit an insn that's a simple single-set. Both the operands must be
1475 known to be valid. */
1476 inline static rtx_insn *
1477 emit_set_insn (rtx x, rtx y)
1478 {
1479 return emit_insn (gen_rtx_SET (x, y));
1480 }
1481
1482 /* X and Y are two things to compare using CODE. Emit the compare insn and
1483    return the rtx for the CC register in the appropriate mode.  */
1484 rtx
1485 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1486 {
1487 machine_mode mode = SELECT_CC_MODE (code, x, y);
1488 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1489
1490 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1491 return cc_reg;
1492 }
1493
1494 /* Build the SYMBOL_REF for __tls_get_addr. */
1495
1496 static GTY(()) rtx tls_get_addr_libfunc;
1497
1498 rtx
1499 aarch64_tls_get_addr (void)
1500 {
1501 if (!tls_get_addr_libfunc)
1502 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1503 return tls_get_addr_libfunc;
1504 }
1505
1506 /* Return the TLS model to use for ADDR. */
1507
1508 static enum tls_model
1509 tls_symbolic_operand_type (rtx addr)
1510 {
1511 enum tls_model tls_kind = TLS_MODEL_NONE;
1512 if (GET_CODE (addr) == CONST)
1513 {
1514 poly_int64 addend;
1515 rtx sym = strip_offset (addr, &addend);
1516 if (GET_CODE (sym) == SYMBOL_REF)
1517 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1518 }
1519 else if (GET_CODE (addr) == SYMBOL_REF)
1520 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1521
1522 return tls_kind;
1523 }
1524
1525 /* We allow LO_SUMs in our legitimate addresses so that combine can
1526    take care of combining addresses where necessary, but for generation
1527    purposes we generate the address as:
1528
1529         RTL                               Absolute
1530         tmp = hi (symbol_ref);            adrp  x1, foo
1531         dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo12:foo
1532 nop
1533
1534 PIC TLS
1535 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1536 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1537 bl __tls_get_addr
1538 nop
1539
1540 Load TLS symbol, depending on TLS mechanism and TLS access model.
1541
1542 Global Dynamic - Traditional TLS:
1543 adrp tmp, :tlsgd:imm
1544 add dest, tmp, #:tlsgd_lo12:imm
1545 bl __tls_get_addr
1546
1547 Global Dynamic - TLS Descriptors:
1548 adrp dest, :tlsdesc:imm
1549 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1550 add dest, dest, #:tlsdesc_lo12:imm
1551 blr tmp
1552 mrs tp, tpidr_el0
1553 add dest, dest, tp
1554
1555 Initial Exec:
1556 mrs tp, tpidr_el0
1557 adrp tmp, :gottprel:imm
1558 ldr dest, [tmp, #:gottprel_lo12:imm]
1559 add dest, dest, tp
1560
1561 Local Exec:
1562 mrs tp, tpidr_el0
1563 add t0, tp, #:tprel_hi12:imm, lsl #12
1564 add t0, t0, #:tprel_lo12_nc:imm
1565 */
1566
1567 static void
1568 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1569 enum aarch64_symbol_type type)
1570 {
1571 switch (type)
1572 {
1573 case SYMBOL_SMALL_ABSOLUTE:
1574 {
1575 /* In ILP32, the mode of dest can be either SImode or DImode. */
1576 rtx tmp_reg = dest;
1577 machine_mode mode = GET_MODE (dest);
1578
1579 gcc_assert (mode == Pmode || mode == ptr_mode);
1580
1581 if (can_create_pseudo_p ())
1582 tmp_reg = gen_reg_rtx (mode);
1583
1584 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1585 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1586 return;
1587 }
1588
1589 case SYMBOL_TINY_ABSOLUTE:
1590 emit_insn (gen_rtx_SET (dest, imm));
1591 return;
1592
1593 case SYMBOL_SMALL_GOT_28K:
1594 {
1595 machine_mode mode = GET_MODE (dest);
1596 rtx gp_rtx = pic_offset_table_rtx;
1597 rtx insn;
1598 rtx mem;
1599
1600  /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1601     here before RTL expansion.  Tree IVOPTS generates RTL patterns to
1602     decide rtx costs, in which case pic_offset_table_rtx is not
1603     initialized.  In that case there is no need to generate the first adrp
1604     instruction, since the final cost for a global variable access is
1605     one instruction.  */
1606 if (gp_rtx != NULL)
1607 {
1608      /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1609         use the page base as the GOT base, the first page may be wasted;
1610         in the worst case only 28K of space is left for the GOT).
1611
1612         The generated instruction sequence for accessing a global variable
1613         is:
1614
1615 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1616
1617         Only one instruction is needed.  But we must initialize
1618         pic_offset_table_rtx properly.  We generate an initialization insn
1619         for every global access and let CSE remove all the redundant ones.
1620
1621         The final instruction sequence will look like the following
1622         for multiple global variable accesses.
1623
1624 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1625
1626 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1627 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1628 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1629 ... */
1630
1631 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1632 crtl->uses_pic_offset_table = 1;
1633 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1634
1635 if (mode != GET_MODE (gp_rtx))
1636 gp_rtx = gen_lowpart (mode, gp_rtx);
1637
1638 }
1639
1640 if (mode == ptr_mode)
1641 {
1642 if (mode == DImode)
1643 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1644 else
1645 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1646
1647 mem = XVECEXP (SET_SRC (insn), 0, 0);
1648 }
1649 else
1650 {
1651 gcc_assert (mode == Pmode);
1652
1653 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1654 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1655 }
1656
1657      /* The operand is expected to be a MEM.  Whenever the related insn
1658         pattern changes, the code above that computes MEM should be
1659         updated.  */
1660 gcc_assert (GET_CODE (mem) == MEM);
1661 MEM_READONLY_P (mem) = 1;
1662 MEM_NOTRAP_P (mem) = 1;
1663 emit_insn (insn);
1664 return;
1665 }
1666
1667 case SYMBOL_SMALL_GOT_4G:
1668 {
1669 /* In ILP32, the mode of dest can be either SImode or DImode,
1670 while the got entry is always of SImode size. The mode of
1671 dest depends on how dest is used: if dest is assigned to a
1672 pointer (e.g. in the memory), it has SImode; it may have
1673         DImode if dest is dereferenced to access the memory.
1674 This is why we have to handle three different ldr_got_small
1675 patterns here (two patterns for ILP32). */
1676
1677 rtx insn;
1678 rtx mem;
1679 rtx tmp_reg = dest;
1680 machine_mode mode = GET_MODE (dest);
1681
1682 if (can_create_pseudo_p ())
1683 tmp_reg = gen_reg_rtx (mode);
1684
1685 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1686 if (mode == ptr_mode)
1687 {
1688 if (mode == DImode)
1689 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1690 else
1691 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1692
1693 mem = XVECEXP (SET_SRC (insn), 0, 0);
1694 }
1695 else
1696 {
1697 gcc_assert (mode == Pmode);
1698
1699 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1700 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1701 }
1702
1703 gcc_assert (GET_CODE (mem) == MEM);
1704 MEM_READONLY_P (mem) = 1;
1705 MEM_NOTRAP_P (mem) = 1;
1706 emit_insn (insn);
1707 return;
1708 }
1709
1710 case SYMBOL_SMALL_TLSGD:
1711 {
1712 rtx_insn *insns;
1713 machine_mode mode = GET_MODE (dest);
1714 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1715
1716 start_sequence ();
1717 if (TARGET_ILP32)
1718 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1719 else
1720 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1721 insns = get_insns ();
1722 end_sequence ();
1723
1724 RTL_CONST_CALL_P (insns) = 1;
1725 emit_libcall_block (insns, dest, result, imm);
1726 return;
1727 }
1728
1729 case SYMBOL_SMALL_TLSDESC:
1730 {
1731 machine_mode mode = GET_MODE (dest);
1732 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1733 rtx tp;
1734
1735 gcc_assert (mode == Pmode || mode == ptr_mode);
1736
1737 /* In ILP32, the got entry is always of SImode size. Unlike
1738 small GOT, the dest is fixed at reg 0. */
1739 if (TARGET_ILP32)
1740 emit_insn (gen_tlsdesc_small_si (imm));
1741 else
1742 emit_insn (gen_tlsdesc_small_di (imm));
1743 tp = aarch64_load_tp (NULL);
1744
1745 if (mode != Pmode)
1746 tp = gen_lowpart (mode, tp);
1747
1748 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1749 if (REG_P (dest))
1750 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1751 return;
1752 }
1753
1754 case SYMBOL_SMALL_TLSIE:
1755 {
1756 /* In ILP32, the mode of dest can be either SImode or DImode,
1757 while the got entry is always of SImode size. The mode of
1758 dest depends on how dest is used: if dest is assigned to a
1759 pointer (e.g. in the memory), it has SImode; it may have
1760         DImode if dest is dereferenced to access the memory.
1761 This is why we have to handle three different tlsie_small
1762 patterns here (two patterns for ILP32). */
1763 machine_mode mode = GET_MODE (dest);
1764 rtx tmp_reg = gen_reg_rtx (mode);
1765 rtx tp = aarch64_load_tp (NULL);
1766
1767 if (mode == ptr_mode)
1768 {
1769 if (mode == DImode)
1770 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1771 else
1772 {
1773 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1774 tp = gen_lowpart (mode, tp);
1775 }
1776 }
1777 else
1778 {
1779 gcc_assert (mode == Pmode);
1780 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1781 }
1782
1783 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1784 if (REG_P (dest))
1785 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1786 return;
1787 }
1788
1789 case SYMBOL_TLSLE12:
1790 case SYMBOL_TLSLE24:
1791 case SYMBOL_TLSLE32:
1792 case SYMBOL_TLSLE48:
1793 {
1794 machine_mode mode = GET_MODE (dest);
1795 rtx tp = aarch64_load_tp (NULL);
1796
1797 if (mode != Pmode)
1798 tp = gen_lowpart (mode, tp);
1799
1800 switch (type)
1801 {
1802 case SYMBOL_TLSLE12:
1803 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1804 (dest, tp, imm));
1805 break;
1806 case SYMBOL_TLSLE24:
1807 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1808 (dest, tp, imm));
1809 break;
1810 case SYMBOL_TLSLE32:
1811 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1812 (dest, imm));
1813 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1814 (dest, dest, tp));
1815 break;
1816 case SYMBOL_TLSLE48:
1817 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1818 (dest, imm));
1819 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1820 (dest, dest, tp));
1821 break;
1822 default:
1823 gcc_unreachable ();
1824 }
1825
1826 if (REG_P (dest))
1827 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1828 return;
1829 }
1830
1831 case SYMBOL_TINY_GOT:
1832 emit_insn (gen_ldr_got_tiny (dest, imm));
1833 return;
1834
1835 case SYMBOL_TINY_TLSIE:
1836 {
1837 machine_mode mode = GET_MODE (dest);
1838 rtx tp = aarch64_load_tp (NULL);
1839
1840 if (mode == ptr_mode)
1841 {
1842 if (mode == DImode)
1843 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1844 else
1845 {
1846 tp = gen_lowpart (mode, tp);
1847 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1848 }
1849 }
1850 else
1851 {
1852 gcc_assert (mode == Pmode);
1853 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1854 }
1855
1856 if (REG_P (dest))
1857 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1858 return;
1859 }
1860
1861 default:
1862 gcc_unreachable ();
1863 }
1864 }
1865
1866 /* Emit a move from SRC to DEST. Assume that the move expanders can
1867 handle all moves if !can_create_pseudo_p (). The distinction is
1868 important because, unlike emit_move_insn, the move expanders know
1869 how to force Pmode objects into the constant pool even when the
1870 constant pool address is not itself legitimate. */
1871 static rtx
1872 aarch64_emit_move (rtx dest, rtx src)
1873 {
1874 return (can_create_pseudo_p ()
1875 ? emit_move_insn (dest, src)
1876 : emit_move_insn_1 (dest, src));
1877 }
1878
1879 /* Apply UNOPTAB to OP and store the result in DEST. */
1880
1881 static void
1882 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1883 {
1884 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1885 if (dest != tmp)
1886 emit_move_insn (dest, tmp);
1887 }
1888
1889 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1890
1891 static void
1892 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1893 {
1894 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1895 OPTAB_DIRECT);
1896 if (dest != tmp)
1897 emit_move_insn (dest, tmp);
1898 }
1899
1900 /* Split a 128-bit move operation into two 64-bit move operations,
1901 taking care to handle partial overlap of register to register
1902 copies. Special cases are needed when moving between GP regs and
1903 FP regs. SRC can be a register, constant or memory; DST a register
1904 or memory. If either operand is memory it must not have any side
1905 effects. */
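/* For example (illustrative): splitting a TImode copy whose source lives
   in {x1, x2} and whose destination is {x2, x3} must move the high half
   first (x3 <- x2, then x2 <- x1), since copying the low half first would
   clobber x2 before it is read.  The overlap check below picks that
   ordering.  */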
1906 void
1907 aarch64_split_128bit_move (rtx dst, rtx src)
1908 {
1909 rtx dst_lo, dst_hi;
1910 rtx src_lo, src_hi;
1911
1912 machine_mode mode = GET_MODE (dst);
1913
1914 gcc_assert (mode == TImode || mode == TFmode);
1915 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1916 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1917
1918 if (REG_P (dst) && REG_P (src))
1919 {
1920 int src_regno = REGNO (src);
1921 int dst_regno = REGNO (dst);
1922
1923 /* Handle FP <-> GP regs. */
1924 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1925 {
1926 src_lo = gen_lowpart (word_mode, src);
1927 src_hi = gen_highpart (word_mode, src);
1928
1929 if (mode == TImode)
1930 {
1931 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1932 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1933 }
1934 else
1935 {
1936 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1937 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1938 }
1939 return;
1940 }
1941 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1942 {
1943 dst_lo = gen_lowpart (word_mode, dst);
1944 dst_hi = gen_highpart (word_mode, dst);
1945
1946 if (mode == TImode)
1947 {
1948 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1949 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1950 }
1951 else
1952 {
1953 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1954 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1955 }
1956 return;
1957 }
1958 }
1959
1960 dst_lo = gen_lowpart (word_mode, dst);
1961 dst_hi = gen_highpart (word_mode, dst);
1962 src_lo = gen_lowpart (word_mode, src);
1963 src_hi = gen_highpart_mode (word_mode, mode, src);
1964
1965 /* At most one pairing may overlap. */
1966 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1967 {
1968 aarch64_emit_move (dst_hi, src_hi);
1969 aarch64_emit_move (dst_lo, src_lo);
1970 }
1971 else
1972 {
1973 aarch64_emit_move (dst_lo, src_lo);
1974 aarch64_emit_move (dst_hi, src_hi);
1975 }
1976 }
1977
1978 bool
1979 aarch64_split_128bit_move_p (rtx dst, rtx src)
1980 {
1981 return (! REG_P (src)
1982 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1983 }
1984
1985 /* Split a complex SIMD combine. */
1986
1987 void
1988 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1989 {
1990 machine_mode src_mode = GET_MODE (src1);
1991 machine_mode dst_mode = GET_MODE (dst);
1992
1993 gcc_assert (VECTOR_MODE_P (dst_mode));
1994 gcc_assert (register_operand (dst, dst_mode)
1995 && register_operand (src1, src_mode)
1996 && register_operand (src2, src_mode));
1997
1998 rtx (*gen) (rtx, rtx, rtx);
1999
2000 switch (src_mode)
2001 {
2002 case E_V8QImode:
2003 gen = gen_aarch64_simd_combinev8qi;
2004 break;
2005 case E_V4HImode:
2006 gen = gen_aarch64_simd_combinev4hi;
2007 break;
2008 case E_V2SImode:
2009 gen = gen_aarch64_simd_combinev2si;
2010 break;
2011 case E_V4HFmode:
2012 gen = gen_aarch64_simd_combinev4hf;
2013 break;
2014 case E_V2SFmode:
2015 gen = gen_aarch64_simd_combinev2sf;
2016 break;
2017 case E_DImode:
2018 gen = gen_aarch64_simd_combinedi;
2019 break;
2020 case E_DFmode:
2021 gen = gen_aarch64_simd_combinedf;
2022 break;
2023 default:
2024 gcc_unreachable ();
2025 }
2026
2027 emit_insn (gen (dst, src1, src2));
2028 return;
2029 }
2030
2031 /* Split a complex SIMD move. */
2032
2033 void
2034 aarch64_split_simd_move (rtx dst, rtx src)
2035 {
2036 machine_mode src_mode = GET_MODE (src);
2037 machine_mode dst_mode = GET_MODE (dst);
2038
2039 gcc_assert (VECTOR_MODE_P (dst_mode));
2040
2041 if (REG_P (dst) && REG_P (src))
2042 {
2043 rtx (*gen) (rtx, rtx);
2044
2045 gcc_assert (VECTOR_MODE_P (src_mode));
2046
2047 switch (src_mode)
2048 {
2049 case E_V16QImode:
2050 gen = gen_aarch64_split_simd_movv16qi;
2051 break;
2052 case E_V8HImode:
2053 gen = gen_aarch64_split_simd_movv8hi;
2054 break;
2055 case E_V4SImode:
2056 gen = gen_aarch64_split_simd_movv4si;
2057 break;
2058 case E_V2DImode:
2059 gen = gen_aarch64_split_simd_movv2di;
2060 break;
2061 case E_V8HFmode:
2062 gen = gen_aarch64_split_simd_movv8hf;
2063 break;
2064 case E_V4SFmode:
2065 gen = gen_aarch64_split_simd_movv4sf;
2066 break;
2067 case E_V2DFmode:
2068 gen = gen_aarch64_split_simd_movv2df;
2069 break;
2070 default:
2071 gcc_unreachable ();
2072 }
2073
2074 emit_insn (gen (dst, src));
2075 return;
2076 }
2077 }
2078
2079 bool
2080 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2081 machine_mode ymode, rtx y)
2082 {
2083 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2084 gcc_assert (r != NULL);
2085 return rtx_equal_p (x, r);
2086 }
2087
2088
2089 static rtx
2090 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2091 {
2092 if (can_create_pseudo_p ())
2093 return force_reg (mode, value);
2094 else
2095 {
2096 gcc_assert (x);
2097 aarch64_emit_move (x, value);
2098 return x;
2099 }
2100 }
2101
2102 /* Return true if we can move VALUE into a register using a single
2103 CNT[BHWD] instruction. */
2104
2105 static bool
2106 aarch64_sve_cnt_immediate_p (poly_int64 value)
2107 {
2108 HOST_WIDE_INT factor = value.coeffs[0];
2109 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2110 return (value.coeffs[1] == factor
2111 && IN_RANGE (factor, 2, 16 * 16)
2112 && (factor & 1) == 0
2113 && factor <= 16 * (factor & -factor));
2114 }
2115
2116 /* Likewise for rtx X. */
2117
2118 bool
2119 aarch64_sve_cnt_immediate_p (rtx x)
2120 {
2121 poly_int64 value;
2122 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2123 }
2124
2125 /* Return the asm string for an instruction with a CNT-like vector size
2126 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2127 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2128 first part of the operands template (the part that comes before the
2129 vector size itself). FACTOR is the number of quadwords.
2130 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2131 If it is zero, we can use any element size. */
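/* For example (illustrative): FACTOR == 8 with NELTS_PER_VQ == 0 selects
   the halfword form with a multiplier of 1, giving "cnth\t<operands>",
   while FACTOR == 32 gives "cntb\t<operands>, all, mul #2".  */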
2132
2133 static char *
2134 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2135 unsigned int factor,
2136 unsigned int nelts_per_vq)
2137 {
2138 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2139
2140 if (nelts_per_vq == 0)
2141 /* There is some overlap in the ranges of the four CNT instructions.
2142 Here we always use the smallest possible element size, so that the
2143 multiplier is 1 wherever possible. */
2144 nelts_per_vq = factor & -factor;
2145 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2146 gcc_assert (IN_RANGE (shift, 1, 4));
2147 char suffix = "dwhb"[shift - 1];
2148
2149 factor >>= shift;
2150 unsigned int written;
2151 if (factor == 1)
2152 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2153 prefix, suffix, operands);
2154 else
2155 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2156 prefix, suffix, operands, factor);
2157 gcc_assert (written < sizeof (buffer));
2158 return buffer;
2159 }
2160
2161 /* Return the asm string for an instruction with a CNT-like vector size
2162 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2163 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2164 first part of the operands template (the part that comes before the
2165 vector size itself). X is the value of the vector size operand,
2166 as a polynomial integer rtx. */
2167
2168 char *
2169 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2170 rtx x)
2171 {
2172 poly_int64 value = rtx_to_poly_int64 (x);
2173 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2174 return aarch64_output_sve_cnt_immediate (prefix, operands,
2175 value.coeffs[1], 0);
2176 }
2177
2178 /* Return true if we can add VALUE to a register using a single ADDVL
2179 or ADDPL instruction. */
2180
2181 static bool
2182 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2183 {
2184 HOST_WIDE_INT factor = value.coeffs[0];
2185 if (factor == 0 || value.coeffs[1] != factor)
2186 return false;
2187 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2188 and a value of 16 is one vector width. */
2189 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2190 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2191 }
2192
2193 /* Likewise for rtx X. */
2194
2195 bool
2196 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2197 {
2198 poly_int64 value;
2199 return (poly_int_rtx_p (x, &value)
2200 && aarch64_sve_addvl_addpl_immediate_p (value));
2201 }
2202
2203 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2204 and storing the result in operand 0. */
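/* For example (illustrative): an offset of one full vector (coefficients
   {16, 16}) with DEST == BASE in a GP register folds to "incb\t%x0",
   whereas an offset of one predicate width ({2, 2}) with a distinct base
   becomes "addpl\t%x0, %x1, #1".  */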
2205
2206 char *
2207 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2208 {
2209 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2210 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2211 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2212
2213 /* Use INC or DEC if possible. */
2214 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2215 {
2216 if (aarch64_sve_cnt_immediate_p (offset_value))
2217 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2218 offset_value.coeffs[1], 0);
2219 if (aarch64_sve_cnt_immediate_p (-offset_value))
2220 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2221 -offset_value.coeffs[1], 0);
2222 }
2223
2224 int factor = offset_value.coeffs[1];
2225 if ((factor & 15) == 0)
2226 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2227 else
2228 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2229 return buffer;
2230 }
2231
2232 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2233 instruction. If it is, store the number of elements in each vector
2234 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2235 factor in *FACTOR_OUT (if nonnull). */
2236
2237 bool
2238 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2239 unsigned int *nelts_per_vq_out)
2240 {
2241 rtx elt;
2242 poly_int64 value;
2243
2244 if (!const_vec_duplicate_p (x, &elt)
2245 || !poly_int_rtx_p (elt, &value))
2246 return false;
2247
2248 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2249 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2250 /* There's no vector INCB. */
2251 return false;
2252
2253 HOST_WIDE_INT factor = value.coeffs[0];
2254 if (value.coeffs[1] != factor)
2255 return false;
2256
2257 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2258 if ((factor % nelts_per_vq) != 0
2259 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2260 return false;
2261
2262 if (factor_out)
2263 *factor_out = factor;
2264 if (nelts_per_vq_out)
2265 *nelts_per_vq_out = nelts_per_vq;
2266 return true;
2267 }
2268
2269 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2270 instruction. */
2271
2272 bool
2273 aarch64_sve_inc_dec_immediate_p (rtx x)
2274 {
2275 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2276 }
2277
2278 /* Return the asm template for an SVE vector INC or DEC instruction.
2279 OPERANDS gives the operands before the vector count and X is the
2280 value of the vector count operand itself. */
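/* For example (illustrative): a VNx8HI duplicate of {16, 16} has
   FACTOR == 16 and 8 elements per quadword, so the template expands to
   "inch\t<operands>, all, mul #2"; the negated constant would use
   "dech" instead.  */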
2281
2282 char *
2283 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2284 {
2285 int factor;
2286 unsigned int nelts_per_vq;
2287 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2288 gcc_unreachable ();
2289 if (factor < 0)
2290 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2291 nelts_per_vq);
2292 else
2293 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2294 nelts_per_vq);
2295 }
2296
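/* Build the scalar integer immediate IMM of mode MODE in DEST and return
   the number of instructions needed; only emit code when GENERATE is true.
   As a rough example, 0x1234000056780000 has two zero 16-bit chunks, so it
   can be built in two instructions rather than four, roughly
       movz x0, #0x5678, lsl #16
       movk x0, #0x1234, lsl #48
   (the use of x0 here is purely illustrative).  */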
2297 static int
2298 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2299 scalar_int_mode mode)
2300 {
2301 int i;
2302 unsigned HOST_WIDE_INT val, val2, mask;
2303 int one_match, zero_match;
2304 int num_insns;
2305
2306 val = INTVAL (imm);
2307
2308 if (aarch64_move_imm (val, mode))
2309 {
2310 if (generate)
2311 emit_insn (gen_rtx_SET (dest, imm));
2312 return 1;
2313 }
2314
2315 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2316 (with XXXX non-zero). In that case check to see if the move can be done in
2317 a smaller mode. */
2318 val2 = val & 0xffffffff;
2319 if (mode == DImode
2320 && aarch64_move_imm (val2, SImode)
2321 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2322 {
2323 if (generate)
2324 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2325
2326 /* Check whether we have to emit a second instruction, i.e. whether
2327 any of the upper 32 bits of the original DImode value are set. */
2328 if (val == val2)
2329 return 1;
2330
2331 i = (val >> 48) ? 48 : 32;
2332
2333 if (generate)
2334 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2335 GEN_INT ((val >> i) & 0xffff)));
2336
2337 return 2;
2338 }
2339
2340 if ((val >> 32) == 0 || mode == SImode)
2341 {
2342 if (generate)
2343 {
2344 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2345 if (mode == SImode)
2346 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2347 GEN_INT ((val >> 16) & 0xffff)));
2348 else
2349 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2350 GEN_INT ((val >> 16) & 0xffff)));
2351 }
2352 return 2;
2353 }
2354
2355 /* Remaining cases are all for DImode. */
2356
2357 mask = 0xffff;
2358 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2359 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2360 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2361 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2362
2363 if (zero_match != 2 && one_match != 2)
2364 {
2365 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2366 For a 64-bit bitmask try whether changing 16 bits to all ones or
2367 zeroes creates a valid bitmask. To check any repeated bitmask,
2368 try using 16 bits from the other 32-bit half of val. */
2369
2370 for (i = 0; i < 64; i += 16, mask <<= 16)
2371 {
2372 val2 = val & ~mask;
2373 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2374 break;
2375 val2 = val | mask;
2376 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2377 break;
2378 val2 = val2 & ~mask;
2379 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2380 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2381 break;
2382 }
2383 if (i != 64)
2384 {
2385 if (generate)
2386 {
2387 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2388 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2389 GEN_INT ((val >> i) & 0xffff)));
2390 }
2391 return 2;
2392 }
2393 }
2394
2395 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2396 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2397 otherwise skip zero bits. */
2398
2399 num_insns = 1;
2400 mask = 0xffff;
2401 val2 = one_match > zero_match ? ~val : val;
2402 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2403
2404 if (generate)
2405 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2406 ? (val | ~(mask << i))
2407 : (val & (mask << i)))));
2408 for (i += 16; i < 64; i += 16)
2409 {
2410 if ((val2 & (mask << i)) == 0)
2411 continue;
2412 if (generate)
2413 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2414 GEN_INT ((val >> i) & 0xffff)));
2415 num_insns ++;
2416 }
2417
2418 return num_insns;
2419 }
2420
2421 /* Return whether imm is a 128-bit immediate which is simple enough to
2422 expand inline. */
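/* For example (illustrative): a CONST_WIDE_INT whose two 64-bit halves
   each need at most two MOV/MOVK instructions (four in total) is cheap
   enough to expand inline; anything costlier is left to be loaded another
   way, typically from memory.  */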
2423 bool
2424 aarch64_mov128_immediate (rtx imm)
2425 {
2426 if (GET_CODE (imm) == CONST_INT)
2427 return true;
2428
2429 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2430
2431 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2432 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2433
2434 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2435 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2436 }
2437
2438
2439 /* Return the number of temporary registers that aarch64_add_offset_1
2440 would need to add OFFSET to a register. */
2441
2442 static unsigned int
2443 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2444 {
2445 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2446 }
2447
2448 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2449 a non-polynomial OFFSET. MODE is the mode of the addition.
2450 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2451 be set and CFA adjustments added to the generated instructions.
2452
2453 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2454 temporary if register allocation is already complete. This temporary
2455 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2456 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2457 the immediate again.
2458
2459 Since this function may be used to adjust the stack pointer, we must
2460 ensure that it cannot cause transient stack deallocation (for example
2461 by first incrementing SP and then decrementing when adjusting by a
2462 large immediate). */
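/* For example (illustrative): adding 0x123456 with no spare temporary can
   be done as two immediate additions,
       add  dest, src, #0x456
       add  dest, dest, #0x123000
   both of which fit the 12-bit (optionally shifted) ADD immediate form.  */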
2463
2464 static void
2465 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2466 rtx src, HOST_WIDE_INT offset, rtx temp1,
2467 bool frame_related_p, bool emit_move_imm)
2468 {
2469 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2470 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2471
2472 HOST_WIDE_INT moffset = abs_hwi (offset);
2473 rtx_insn *insn;
2474
2475 if (!moffset)
2476 {
2477 if (!rtx_equal_p (dest, src))
2478 {
2479 insn = emit_insn (gen_rtx_SET (dest, src));
2480 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2481 }
2482 return;
2483 }
2484
2485 /* Single instruction adjustment. */
2486 if (aarch64_uimm12_shift (moffset))
2487 {
2488 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2489 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2490 return;
2491 }
2492
2493 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2494 and either:
2495
2496 a) the offset cannot be loaded by a 16-bit move or
2497 b) there is no spare register into which we can move it. */
2498 if (moffset < 0x1000000
2499 && ((!temp1 && !can_create_pseudo_p ())
2500 || !aarch64_move_imm (moffset, mode)))
2501 {
2502 HOST_WIDE_INT low_off = moffset & 0xfff;
2503
2504 low_off = offset < 0 ? -low_off : low_off;
2505 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2506 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2507 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2508 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2509 return;
2510 }
2511
2512 /* Emit a move immediate if required and an addition/subtraction. */
2513 if (emit_move_imm)
2514 {
2515 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2516 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2517 }
2518 insn = emit_insn (offset < 0
2519 ? gen_sub3_insn (dest, src, temp1)
2520 : gen_add3_insn (dest, src, temp1));
2521 if (frame_related_p)
2522 {
2523 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2524 rtx adj = plus_constant (mode, src, offset);
2525 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2526 }
2527 }
2528
2529 /* Return the number of temporary registers that aarch64_add_offset
2530 would need to move OFFSET into a register or add OFFSET to a register;
2531 ADD_P is true if we want the latter rather than the former. */
2532
2533 static unsigned int
2534 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2535 {
2536 /* This follows the same structure as aarch64_add_offset. */
2537 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2538 return 0;
2539
2540 unsigned int count = 0;
2541 HOST_WIDE_INT factor = offset.coeffs[1];
2542 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2543 poly_int64 poly_offset (factor, factor);
2544 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2545 /* Need one register for the ADDVL/ADDPL result. */
2546 count += 1;
2547 else if (factor != 0)
2548 {
2549 factor = abs (factor);
2550 if (factor > 16 * (factor & -factor))
2551 /* Need one register for the CNT result and one for the multiplication
2552 factor. If necessary, the second temporary can be reused for the
2553 constant part of the offset. */
2554 return 2;
2555 /* Need one register for the CNT result (which might then
2556 be shifted). */
2557 count += 1;
2558 }
2559 return count + aarch64_add_offset_1_temporaries (constant);
2560 }
2561
2562 /* If X can be represented as a poly_int64, return the number
2563 of temporaries that are required to add it to a register.
2564 Return -1 otherwise. */
2565
2566 int
2567 aarch64_add_offset_temporaries (rtx x)
2568 {
2569 poly_int64 offset;
2570 if (!poly_int_rtx_p (x, &offset))
2571 return -1;
2572 return aarch64_offset_temporaries (true, offset);
2573 }
2574
2575 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2576 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2577 be set and CFA adjustments added to the generated instructions.
2578
2579 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2580 temporary if register allocation is already complete. This temporary
2581 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2582 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2583 false to avoid emitting the immediate again.
2584
2585 TEMP2, if nonnull, is a second temporary register that doesn't
2586 overlap either DEST or SRC.
2587
2588 Since this function may be used to adjust the stack pointer, we must
2589 ensure that it cannot cause transient stack deallocation (for example
2590 by first incrementing SP and then decrementing when adjusting by a
2591 large immediate). */
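/* For example (illustrative): an offset of two SVE vectors plus 4 bytes
   (coefficients {36, 32}) can be added as an ADDVL for the VG-based part
   followed by a constant addition, roughly
       addvl dest, src, #2
       add   dest, dest, #4
   while factors outside the ADDVL/ADDPL range fall back to the CNT-based
   sequence below.  */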
2592
2593 static void
2594 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2595 poly_int64 offset, rtx temp1, rtx temp2,
2596 bool frame_related_p, bool emit_move_imm = true)
2597 {
2598 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2599 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2600 gcc_assert (temp1 == NULL_RTX
2601 || !frame_related_p
2602 || !reg_overlap_mentioned_p (temp1, dest));
2603 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2604
2605 /* Try using ADDVL or ADDPL to add the whole value. */
2606 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2607 {
2608 rtx offset_rtx = gen_int_mode (offset, mode);
2609 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2610 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2611 return;
2612 }
2613
2614 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2615 SVE vector register, over and above the minimum size of 128 bits.
2616 This is equivalent to half the value returned by CNTD with a
2617 vector shape of ALL. */
2618 HOST_WIDE_INT factor = offset.coeffs[1];
2619 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2620
2621 /* Try using ADDVL or ADDPL to add the VG-based part. */
2622 poly_int64 poly_offset (factor, factor);
2623 if (src != const0_rtx
2624 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2625 {
2626 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2627 if (frame_related_p)
2628 {
2629 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2630 RTX_FRAME_RELATED_P (insn) = true;
2631 src = dest;
2632 }
2633 else
2634 {
2635 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2636 src = aarch64_force_temporary (mode, temp1, addr);
2637 temp1 = temp2;
2638 temp2 = NULL_RTX;
2639 }
2640 }
2641 /* Otherwise use a CNT-based sequence. */
2642 else if (factor != 0)
2643 {
2644 /* Use a subtraction if we have a negative factor. */
2645 rtx_code code = PLUS;
2646 if (factor < 0)
2647 {
2648 factor = -factor;
2649 code = MINUS;
2650 }
2651
2652 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2653 into the multiplication. */
2654 rtx val;
2655 int shift = 0;
2656 if (factor & 1)
2657 /* Use a right shift by 1. */
2658 shift = -1;
2659 else
2660 factor /= 2;
2661 HOST_WIDE_INT low_bit = factor & -factor;
2662 if (factor <= 16 * low_bit)
2663 {
2664 if (factor > 16 * 8)
2665 {
2666 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2667 the value with the minimum multiplier and shift it into
2668 position. */
2669 int extra_shift = exact_log2 (low_bit);
2670 shift += extra_shift;
2671 factor >>= extra_shift;
2672 }
2673 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2674 }
2675 else
2676 {
2677 /* Use CNTD, then multiply it by FACTOR. */
2678 val = gen_int_mode (poly_int64 (2, 2), mode);
2679 val = aarch64_force_temporary (mode, temp1, val);
2680
2681 /* Go back to using a negative multiplication factor if we have
2682 no register from which to subtract. */
2683 if (code == MINUS && src == const0_rtx)
2684 {
2685 factor = -factor;
2686 code = PLUS;
2687 }
2688 rtx coeff1 = gen_int_mode (factor, mode);
2689 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2690 val = gen_rtx_MULT (mode, val, coeff1);
2691 }
2692
2693 if (shift > 0)
2694 {
2695 /* Multiply by 1 << SHIFT. */
2696 val = aarch64_force_temporary (mode, temp1, val);
2697 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2698 }
2699 else if (shift == -1)
2700 {
2701 /* Divide by 2. */
2702 val = aarch64_force_temporary (mode, temp1, val);
2703 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2704 }
2705
2706 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2707 if (src != const0_rtx)
2708 {
2709 val = aarch64_force_temporary (mode, temp1, val);
2710 val = gen_rtx_fmt_ee (code, mode, src, val);
2711 }
2712 else if (code == MINUS)
2713 {
2714 val = aarch64_force_temporary (mode, temp1, val);
2715 val = gen_rtx_NEG (mode, val);
2716 }
2717
2718 if (constant == 0 || frame_related_p)
2719 {
2720 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2721 if (frame_related_p)
2722 {
2723 RTX_FRAME_RELATED_P (insn) = true;
2724 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2725 gen_rtx_SET (dest, plus_constant (Pmode, src,
2726 poly_offset)));
2727 }
2728 src = dest;
2729 if (constant == 0)
2730 return;
2731 }
2732 else
2733 {
2734 src = aarch64_force_temporary (mode, temp1, val);
2735 temp1 = temp2;
2736 temp2 = NULL_RTX;
2737 }
2738
2739 emit_move_imm = true;
2740 }
2741
2742 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2743 frame_related_p, emit_move_imm);
2744 }
2745
2746 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2747 than a poly_int64. */
2748
2749 void
2750 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2751 rtx offset_rtx, rtx temp1, rtx temp2)
2752 {
2753 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2754 temp1, temp2, false);
2755 }
2756
2757 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2758 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2759 if TEMP1 already contains abs (DELTA). */
2760
2761 static inline void
2762 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2763 {
2764 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2765 temp1, temp2, true, emit_move_imm);
2766 }
2767
2768 /* Subtract DELTA from the stack pointer, marking the instructions
2769 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2770 if nonnull. */
2771
2772 static inline void
2773 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2774 {
2775 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2776 temp1, temp2, frame_related_p);
2777 }
2778
2779 /* Set DEST to (vec_series BASE STEP). */
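/* For example (illustrative): with BASE 0 and STEP 1 in a VNx4SI
   destination this typically becomes a single SVE INDEX instruction,
   roughly "index\tz0.s, #0, #1".  */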
2780
2781 static void
2782 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2783 {
2784 machine_mode mode = GET_MODE (dest);
2785 scalar_mode inner = GET_MODE_INNER (mode);
2786
2787 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2788 if (!aarch64_sve_index_immediate_p (base))
2789 base = force_reg (inner, base);
2790 if (!aarch64_sve_index_immediate_p (step))
2791 step = force_reg (inner, step);
2792
2793 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2794 }
2795
2796 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2797 integer of mode SRC_MODE. Return true on success. */
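/* For example (illustrative): given the 64-bit integer 0x0000000200000001
   (the little-endian image of a repeating {1, 2} pair of SImode elements),
   the move is done as a duplicate of that value across a vector of DImode
   elements; a full 128-bit constant instead uses LD1RQ from the constant
   pool.  */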
2798
2799 static bool
2800 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2801 rtx src)
2802 {
2803 /* If the constant is smaller than 128 bits, we can do the move
2804 using a vector of SRC_MODEs. */
2805 if (src_mode != TImode)
2806 {
2807 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2808 GET_MODE_SIZE (src_mode));
2809 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2810 emit_move_insn (gen_lowpart (dup_mode, dest),
2811 gen_const_vec_duplicate (dup_mode, src));
2812 return true;
2813 }
2814
2815 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2816 src = force_const_mem (src_mode, src);
2817 if (!src)
2818 return false;
2819
2820 /* Make sure that the address is legitimate. */
2821 if (!aarch64_sve_ld1r_operand_p (src))
2822 {
2823 rtx addr = force_reg (Pmode, XEXP (src, 0));
2824 src = replace_equiv_address (src, addr);
2825 }
2826
2827 machine_mode mode = GET_MODE (dest);
2828 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2829 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2830 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2831 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2832 emit_insn (gen_rtx_SET (dest, src));
2833 return true;
2834 }
2835
2836 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2837 isn't a simple duplicate or series. */
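/* For example (illustrative): a constant made of two interleaved linear
   series such as { 0, 16, 1, 17, 2, 18, ... } is built by expanding
   { 0, 1, 2, ... } and { 16, 17, 18, ... } separately and then
   interleaving them with a ZIP1 permute.  */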
2838
2839 static void
2840 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2841 {
2842 machine_mode mode = GET_MODE (src);
2843 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2844 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2845 gcc_assert (npatterns > 1);
2846
2847 if (nelts_per_pattern == 1)
2848 {
2849 /* The constant is a repeating sequence of at least two elements,
2850 where the repeating elements occupy no more than 128 bits.
2851 Get an integer representation of the replicated value. */
2852 scalar_int_mode int_mode;
2853 if (BYTES_BIG_ENDIAN)
2854 /* For now, always use LD1RQ to load the value on big-endian
2855 targets, since the handling of smaller integers includes a
2856 subreg that is semantically an element reverse. */
2857 int_mode = TImode;
2858 else
2859 {
2860 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2861 gcc_assert (int_bits <= 128);
2862 int_mode = int_mode_for_size (int_bits, 0).require ();
2863 }
2864 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2865 if (int_value
2866 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2867 return;
2868 }
2869
2870 /* Expand each pattern individually. */
2871 rtx_vector_builder builder;
2872 auto_vec<rtx, 16> vectors (npatterns);
2873 for (unsigned int i = 0; i < npatterns; ++i)
2874 {
2875 builder.new_vector (mode, 1, nelts_per_pattern);
2876 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2877 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2878 vectors.quick_push (force_reg (mode, builder.build ()));
2879 }
2880
2881 /* Use permutes to interleave the separate vectors. */
2882 while (npatterns > 1)
2883 {
2884 npatterns /= 2;
2885 for (unsigned int i = 0; i < npatterns; ++i)
2886 {
2887 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2888 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2889 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2890 vectors[i] = tmp;
2891 }
2892 }
2893 gcc_assert (vectors[0] == dest);
2894 }
2895
2896 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2897 is a pattern that can be used to set DEST to a replicated scalar
2898 element. */
2899
2900 void
2901 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2902 rtx (*gen_vec_duplicate) (rtx, rtx))
2903 {
2904 machine_mode mode = GET_MODE (dest);
2905
2906 /* Check on what type of symbol it is. */
2907 scalar_int_mode int_mode;
2908 if ((GET_CODE (imm) == SYMBOL_REF
2909 || GET_CODE (imm) == LABEL_REF
2910 || GET_CODE (imm) == CONST
2911 || GET_CODE (imm) == CONST_POLY_INT)
2912 && is_a <scalar_int_mode> (mode, &int_mode))
2913 {
2914 rtx mem;
2915 poly_int64 offset;
2916 HOST_WIDE_INT const_offset;
2917 enum aarch64_symbol_type sty;
2918
2919 /* If we have (const (plus symbol offset)), separate out the offset
2920 before we start classifying the symbol. */
2921 rtx base = strip_offset (imm, &offset);
2922
2923 /* We must always add an offset involving VL separately, rather than
2924 folding it into the relocation. */
2925 if (!offset.is_constant (&const_offset))
2926 {
2927 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2928 emit_insn (gen_rtx_SET (dest, imm));
2929 else
2930 {
2931 /* Do arithmetic on 32-bit values if the result is smaller
2932 than that. */
2933 if (partial_subreg_p (int_mode, SImode))
2934 {
2935 /* It is invalid to do symbol calculations in modes
2936 narrower than SImode. */
2937 gcc_assert (base == const0_rtx);
2938 dest = gen_lowpart (SImode, dest);
2939 int_mode = SImode;
2940 }
2941 if (base != const0_rtx)
2942 {
2943 base = aarch64_force_temporary (int_mode, dest, base);
2944 aarch64_add_offset (int_mode, dest, base, offset,
2945 NULL_RTX, NULL_RTX, false);
2946 }
2947 else
2948 aarch64_add_offset (int_mode, dest, base, offset,
2949 dest, NULL_RTX, false);
2950 }
2951 return;
2952 }
2953
2954 sty = aarch64_classify_symbol (base, const_offset);
2955 switch (sty)
2956 {
2957 case SYMBOL_FORCE_TO_MEM:
2958 if (const_offset != 0
2959 && targetm.cannot_force_const_mem (int_mode, imm))
2960 {
2961 gcc_assert (can_create_pseudo_p ());
2962 base = aarch64_force_temporary (int_mode, dest, base);
2963 aarch64_add_offset (int_mode, dest, base, const_offset,
2964 NULL_RTX, NULL_RTX, false);
2965 return;
2966 }
2967
2968 mem = force_const_mem (ptr_mode, imm);
2969 gcc_assert (mem);
2970
2971 /* If we aren't generating PC relative literals, then
2972 we need to expand the literal pool access carefully.
2973 This is something that needs to be done in a number
2974 of places, so could well live as a separate function. */
2975 if (!aarch64_pcrelative_literal_loads)
2976 {
2977 gcc_assert (can_create_pseudo_p ());
2978 base = gen_reg_rtx (ptr_mode);
2979 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2980 if (ptr_mode != Pmode)
2981 base = convert_memory_address (Pmode, base);
2982 mem = gen_rtx_MEM (ptr_mode, base);
2983 }
2984
2985 if (int_mode != ptr_mode)
2986 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2987
2988 emit_insn (gen_rtx_SET (dest, mem));
2989
2990 return;
2991
2992 case SYMBOL_SMALL_TLSGD:
2993 case SYMBOL_SMALL_TLSDESC:
2994 case SYMBOL_SMALL_TLSIE:
2995 case SYMBOL_SMALL_GOT_28K:
2996 case SYMBOL_SMALL_GOT_4G:
2997 case SYMBOL_TINY_GOT:
2998 case SYMBOL_TINY_TLSIE:
2999 if (const_offset != 0)
3000 {
3001 gcc_assert (can_create_pseudo_p ());
3002 base = aarch64_force_temporary (int_mode, dest, base);
3003 aarch64_add_offset (int_mode, dest, base, const_offset,
3004 NULL_RTX, NULL_RTX, false);
3005 return;
3006 }
3007 /* FALLTHRU */
3008
3009 case SYMBOL_SMALL_ABSOLUTE:
3010 case SYMBOL_TINY_ABSOLUTE:
3011 case SYMBOL_TLSLE12:
3012 case SYMBOL_TLSLE24:
3013 case SYMBOL_TLSLE32:
3014 case SYMBOL_TLSLE48:
3015 aarch64_load_symref_appropriately (dest, imm, sty);
3016 return;
3017
3018 default:
3019 gcc_unreachable ();
3020 }
3021 }
3022
3023 if (!CONST_INT_P (imm))
3024 {
3025 rtx base, step, value;
3026 if (GET_CODE (imm) == HIGH
3027 || aarch64_simd_valid_immediate (imm, NULL))
3028 emit_insn (gen_rtx_SET (dest, imm));
3029 else if (const_vec_series_p (imm, &base, &step))
3030 aarch64_expand_vec_series (dest, base, step);
3031 else if (const_vec_duplicate_p (imm, &value))
3032 {
3033 /* If the constant is out of range of an SVE vector move,
3034 load it from memory if we can, otherwise move it into
3035 a register and use a DUP. */
3036 scalar_mode inner_mode = GET_MODE_INNER (mode);
3037 rtx op = force_const_mem (inner_mode, value);
3038 if (!op)
3039 op = force_reg (inner_mode, value);
3040 else if (!aarch64_sve_ld1r_operand_p (op))
3041 {
3042 rtx addr = force_reg (Pmode, XEXP (op, 0));
3043 op = replace_equiv_address (op, addr);
3044 }
3045 emit_insn (gen_vec_duplicate (dest, op));
3046 }
3047 else if (GET_CODE (imm) == CONST_VECTOR
3048 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3049 aarch64_expand_sve_const_vector (dest, imm);
3050 else
3051 {
3052 rtx mem = force_const_mem (mode, imm);
3053 gcc_assert (mem);
3054 emit_move_insn (dest, mem);
3055 }
3056
3057 return;
3058 }
3059
3060 aarch64_internal_mov_immediate (dest, imm, true,
3061 as_a <scalar_int_mode> (mode));
3062 }
3063
3064 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3065 that is known to contain PTRUE. */
3066
3067 void
3068 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3069 {
3070 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3071 gen_rtvec (2, pred, src),
3072 UNSPEC_MERGE_PTRUE)));
3073 }
3074
3075 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3076 operand is in memory. In this case we need to use the predicated LD1
3077 and ST1 instead of LDR and STR, both for correctness on big-endian
3078 targets and because LD1 and ST1 support a wider range of addressing modes.
3079 PRED_MODE is the mode of the predicate.
3080
3081 See the comment at the head of aarch64-sve.md for details about the
3082 big-endian handling. */
3083
3084 void
3085 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3086 {
3087 machine_mode mode = GET_MODE (dest);
3088 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3089 if (!register_operand (src, mode)
3090 && !register_operand (dest, mode))
3091 {
3092 rtx tmp = gen_reg_rtx (mode);
3093 if (MEM_P (src))
3094 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3095 else
3096 emit_move_insn (tmp, src);
3097 src = tmp;
3098 }
3099 aarch64_emit_sve_pred_move (dest, ptrue, src);
3100 }
3101
3102 /* Called only on big-endian targets. See whether an SVE vector move
3103 from SRC to DEST is effectively a REV[BHW] instruction, because at
3104 least one operand is a subreg of an SVE vector that has wider or
3105 narrower elements. Return true and emit the instruction if so.
3106
3107 For example:
3108
3109 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3110
3111 represents a VIEW_CONVERT between the following vectors, viewed
3112 in memory order:
3113
3114 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3115 R1: { [0], [1], [2], [3], ... }
3116
3117 The high part of lane X in R2 should therefore correspond to lane X*2
3118 of R1, but the register representations are:
3119
3120 msb lsb
3121 R2: ...... [1].high [1].low [0].high [0].low
3122 R1: ...... [3] [2] [1] [0]
3123
3124 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3125 We therefore need a reverse operation to swap the high and low values
3126 around.
3127
3128 This is purely an optimization. Without it we would spill the
3129 subreg operand to the stack in one mode and reload it in the
3130 other mode, which has the same effect as the REV. */
3131
3132 bool
3133 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3134 {
3135 gcc_assert (BYTES_BIG_ENDIAN);
3136 if (GET_CODE (dest) == SUBREG)
3137 dest = SUBREG_REG (dest);
3138 if (GET_CODE (src) == SUBREG)
3139 src = SUBREG_REG (src);
3140
3141 /* The optimization handles two single SVE REGs with different element
3142 sizes. */
3143 if (!REG_P (dest)
3144 || !REG_P (src)
3145 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3146 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3147 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3148 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3149 return false;
3150
3151 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3152 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3153 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3154 UNSPEC_REV_SUBREG);
3155 emit_insn (gen_rtx_SET (dest, unspec));
3156 return true;
3157 }
3158
3159 /* Return a copy of X with mode MODE, without changing its other
3160 attributes. Unlike gen_lowpart, this doesn't care whether the
3161 mode change is valid. */
3162
3163 static rtx
3164 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3165 {
3166 if (GET_MODE (x) == mode)
3167 return x;
3168
3169 x = shallow_copy_rtx (x);
3170 set_mode_and_regno (x, mode, REGNO (x));
3171 return x;
3172 }
3173
3174 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3175 operands. */
3176
3177 void
3178 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3179 {
3180 /* Decide which REV operation we need. The mode with narrower elements
3181 determines the mode of the operands and the mode with the wider
3182 elements determines the reverse width. */
3183 machine_mode mode_with_wider_elts = GET_MODE (dest);
3184 machine_mode mode_with_narrower_elts = GET_MODE (src);
3185 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3186 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3187 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3188
3189 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3190 unsigned int unspec;
3191 if (wider_bytes == 8)
3192 unspec = UNSPEC_REV64;
3193 else if (wider_bytes == 4)
3194 unspec = UNSPEC_REV32;
3195 else if (wider_bytes == 2)
3196 unspec = UNSPEC_REV16;
3197 else
3198 gcc_unreachable ();
3199 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3200
3201 /* Emit:
3202
3203 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3204 UNSPEC_MERGE_PTRUE))
3205
3206 with the appropriate modes. */
3207 ptrue = gen_lowpart (pred_mode, ptrue);
3208 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3209 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3210 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3211 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3212 UNSPEC_MERGE_PTRUE);
3213 emit_insn (gen_rtx_SET (dest, src));
3214 }
3215
3216 static bool
3217 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3218 tree exp ATTRIBUTE_UNUSED)
3219 {
3220 /* Currently, always true. */
3221 return true;
3222 }
3223
3224 /* Implement TARGET_PASS_BY_REFERENCE. */
3225
3226 static bool
3227 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3228 machine_mode mode,
3229 const_tree type,
3230 bool named ATTRIBUTE_UNUSED)
3231 {
3232 HOST_WIDE_INT size;
3233 machine_mode dummymode;
3234 int nregs;
3235
3236 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3237 if (mode == BLKmode && type)
3238 size = int_size_in_bytes (type);
3239 else
3240 /* No frontends can create types with variable-sized modes, so we
3241 shouldn't be asked to pass or return them. */
3242 size = GET_MODE_SIZE (mode).to_constant ();
3243
3244 /* Aggregates are passed by reference based on their size. */
3245 if (type && AGGREGATE_TYPE_P (type))
3246 {
3247 size = int_size_in_bytes (type);
3248 }
3249
3250 /* Variable-sized arguments are always passed by reference. */
3251 if (size < 0)
3252 return true;
3253
3254 /* Can this be a candidate to be passed in fp/simd register(s)? */
3255 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3256 &dummymode, &nregs,
3257 NULL))
3258 return false;
3259
3260 /* Arguments which are variable sized or larger than 2 registers are
3261 passed by reference unless they are a homogeneous floating-point
3262 aggregate. */
3263 return size > 2 * UNITS_PER_WORD;
3264 }
3265
3266 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3267 static bool
3268 aarch64_return_in_msb (const_tree valtype)
3269 {
3270 machine_mode dummy_mode;
3271 int dummy_int;
3272
3273 /* Never happens in little-endian mode. */
3274 if (!BYTES_BIG_ENDIAN)
3275 return false;
3276
3277 /* Only composite types smaller than or equal to 16 bytes can
3278 potentially be returned in registers.
3279 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3280 || int_size_in_bytes (valtype) <= 0
3281 || int_size_in_bytes (valtype) > 16)
3282 return false;
3283
3284 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3285 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3286 is always passed/returned in the least significant bits of fp/simd
3287 register(s). */
3288 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3289 &dummy_mode, &dummy_int, NULL))
3290 return false;
3291
3292 return true;
3293 }
3294
3295 /* Implement TARGET_FUNCTION_VALUE.
3296 Define how to find the value returned by a function. */
3297
3298 static rtx
3299 aarch64_function_value (const_tree type, const_tree func,
3300 bool outgoing ATTRIBUTE_UNUSED)
3301 {
3302 machine_mode mode;
3303 int unsignedp;
3304 int count;
3305 machine_mode ag_mode;
3306
3307 mode = TYPE_MODE (type);
3308 if (INTEGRAL_TYPE_P (type))
3309 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3310
3311 if (aarch64_return_in_msb (type))
3312 {
3313 HOST_WIDE_INT size = int_size_in_bytes (type);
3314
3315 if (size % UNITS_PER_WORD != 0)
3316 {
3317 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3318 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3319 }
3320 }
3321
3322 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3323 &ag_mode, &count, NULL))
3324 {
3325 if (!aarch64_composite_type_p (type, mode))
3326 {
3327 gcc_assert (count == 1 && mode == ag_mode);
3328 return gen_rtx_REG (mode, V0_REGNUM);
3329 }
3330 else
3331 {
3332 int i;
3333 rtx par;
3334
3335 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3336 for (i = 0; i < count; i++)
3337 {
3338 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3339 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3340 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3341 XVECEXP (par, 0, i) = tmp;
3342 }
3343 return par;
3344 }
3345 }
3346 else
3347 return gen_rtx_REG (mode, R0_REGNUM);
3348 }
3349
3350 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3351 Return true if REGNO is the number of a hard register in which the values
3352 of called function may come back. */
3353
3354 static bool
3355 aarch64_function_value_regno_p (const unsigned int regno)
3356 {
3357 /* Maximum of 16 bytes can be returned in the general registers. Examples
3358 of 16-byte return values are: 128-bit integers and 16-byte small
3359 structures (excluding homogeneous floating-point aggregates). */
3360 if (regno == R0_REGNUM || regno == R1_REGNUM)
3361 return true;
3362
3363 /* Up to four fp/simd registers can return a function value, e.g. a
3364 homogeneous floating-point aggregate having four members. */
3365 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3366 return TARGET_FLOAT;
3367
3368 return false;
3369 }
3370
3371 /* Implement TARGET_RETURN_IN_MEMORY.
3372
3373 If the type T of the result of a function is such that
3374 void func (T arg)
3375 would require that arg be passed as a value in a register (or set of
3376 registers) according to the parameter passing rules, then the result
3377 is returned in the same registers as would be used for such an
3378 argument. */
3379
3380 static bool
3381 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3382 {
3383 HOST_WIDE_INT size;
3384 machine_mode ag_mode;
3385 int count;
3386
3387 if (!AGGREGATE_TYPE_P (type)
3388 && TREE_CODE (type) != COMPLEX_TYPE
3389 && TREE_CODE (type) != VECTOR_TYPE)
3390 /* Simple scalar types are always returned in registers. */
3391 return false;
3392
3393 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3394 type,
3395 &ag_mode,
3396 &count,
3397 NULL))
3398 return false;
3399
3400 /* Types larger than 2 registers are returned in memory. */
3401 size = int_size_in_bytes (type);
3402 return (size < 0 || size > 2 * UNITS_PER_WORD);
3403 }
3404
3405 static bool
3406 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3407 const_tree type, int *nregs)
3408 {
3409 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3410 return aarch64_vfp_is_call_or_return_candidate (mode,
3411 type,
3412 &pcum->aapcs_vfp_rmode,
3413 nregs,
3414 NULL);
3415 }
3416
3417 /* Given MODE and TYPE of a function argument, return the alignment in
3418 bits. The idea is to suppress any stronger alignment requested by
3419 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3420 This is a helper function for local use only. */
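/* For example (illustrative): an aggregate such as
       struct s { __int128 x; };
   has a natural alignment of 128 bits, which is what triggers the
   AAPCS64 rule C.8 handling (rounding the general register number up to
   an even number) in aarch64_layout_arg below.  */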
3421
3422 static unsigned int
3423 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3424 {
3425 if (!type)
3426 return GET_MODE_ALIGNMENT (mode);
3427
3428 if (integer_zerop (TYPE_SIZE (type)))
3429 return 0;
3430
3431 gcc_assert (TYPE_MODE (type) == mode);
3432
3433 if (!AGGREGATE_TYPE_P (type))
3434 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3435
3436 if (TREE_CODE (type) == ARRAY_TYPE)
3437 return TYPE_ALIGN (TREE_TYPE (type));
3438
3439 unsigned int alignment = 0;
3440 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3441 if (TREE_CODE (field) == FIELD_DECL)
3442 alignment = std::max (alignment, DECL_ALIGN (field));
3443
3444 return alignment;
3445 }
3446
3447 /* Layout a function argument according to the AAPCS64 rules. The rule
3448 numbers refer to the rule numbers in the AAPCS64. */
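/* For example (illustrative): a homogeneous floating-point aggregate of
   four floats is assigned four consecutive SIMD/FP argument registers
   (rules C.1 - C.5), whereas a 16-byte, 16-byte-aligned aggregate that
   would start at an odd general register number is moved up to the next
   even pair (rule C.8).  */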
3449
3450 static void
3451 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3452 const_tree type,
3453 bool named ATTRIBUTE_UNUSED)
3454 {
3455 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3456 int ncrn, nvrn, nregs;
3457 bool allocate_ncrn, allocate_nvrn;
3458 HOST_WIDE_INT size;
3459
3460 /* We need to do this once per argument. */
3461 if (pcum->aapcs_arg_processed)
3462 return;
3463
3464 pcum->aapcs_arg_processed = true;
3465
3466 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3467 if (type)
3468 size = int_size_in_bytes (type);
3469 else
3470 /* No frontends can create types with variable-sized modes, so we
3471 shouldn't be asked to pass or return them. */
3472 size = GET_MODE_SIZE (mode).to_constant ();
3473 size = ROUND_UP (size, UNITS_PER_WORD);
3474
3475 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3476 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3477 mode,
3478 type,
3479 &nregs);
3480
3481 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3482 The following code thus handles passing by SIMD/FP registers first. */
3483
3484 nvrn = pcum->aapcs_nvrn;
3485
3486 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3487 and homogeneous short-vector aggregates (HVA). */
3488 if (allocate_nvrn)
3489 {
3490 if (!TARGET_FLOAT)
3491 aarch64_err_no_fpadvsimd (mode, "argument");
3492
3493 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3494 {
3495 pcum->aapcs_nextnvrn = nvrn + nregs;
3496 if (!aarch64_composite_type_p (type, mode))
3497 {
3498 gcc_assert (nregs == 1);
3499 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3500 }
3501 else
3502 {
3503 rtx par;
3504 int i;
3505 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3506 for (i = 0; i < nregs; i++)
3507 {
3508 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3509 V0_REGNUM + nvrn + i);
3510 rtx offset = gen_int_mode
3511 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3512 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3513 XVECEXP (par, 0, i) = tmp;
3514 }
3515 pcum->aapcs_reg = par;
3516 }
3517 return;
3518 }
3519 else
3520 {
3521 /* C.3 NSRN is set to 8. */
3522 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3523 goto on_stack;
3524 }
3525 }
3526
3527 ncrn = pcum->aapcs_ncrn;
3528 nregs = size / UNITS_PER_WORD;
3529
3530 /* C6 - C9, though the sign and zero extension semantics are
3531 handled elsewhere. This is the case where the argument fits
3532 entirely in general registers. */
3533 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3534 {
3535
3536 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3537
3538 /* C.8 if the argument has an alignment of 16 then the NGRN is
3539 rounded up to the next even number. */
3540 if (nregs == 2
3541 && ncrn % 2
3542 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3543 comparison is there because for > 16 * BITS_PER_UNIT
3544 alignment nregs should be > 2 and therefore it should be
3545 passed by reference rather than value. */
3546 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3547 {
3548 ++ncrn;
3549 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3550 }
3551
3552 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3553 A reg is still generated for it, but the caller should be smart
3554 enough not to use it. */
3555 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3556 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3557 else
3558 {
3559 rtx par;
3560 int i;
3561
3562 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3563 for (i = 0; i < nregs; i++)
3564 {
3565 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3566 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3567 GEN_INT (i * UNITS_PER_WORD));
3568 XVECEXP (par, 0, i) = tmp;
3569 }
3570 pcum->aapcs_reg = par;
3571 }
3572
3573 pcum->aapcs_nextncrn = ncrn + nregs;
3574 return;
3575 }
3576
3577 /* C.11 */
3578 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3579
3580 /* The argument is passed on stack; record the needed number of words for
3581 this argument and align the total size if necessary. */
3582 on_stack:
3583 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3584
3585 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3586 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3587 16 / UNITS_PER_WORD);
3588 return;
3589 }
3590
3591 /* Implement TARGET_FUNCTION_ARG. */
3592
3593 static rtx
3594 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3595 const_tree type, bool named)
3596 {
3597 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3598 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3599
3600 if (mode == VOIDmode)
3601 return NULL_RTX;
3602
3603 aarch64_layout_arg (pcum_v, mode, type, named);
3604 return pcum->aapcs_reg;
3605 }
3606
3607 void
3608 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3609 const_tree fntype ATTRIBUTE_UNUSED,
3610 rtx libname ATTRIBUTE_UNUSED,
3611 const_tree fndecl ATTRIBUTE_UNUSED,
3612 unsigned n_named ATTRIBUTE_UNUSED)
3613 {
3614 pcum->aapcs_ncrn = 0;
3615 pcum->aapcs_nvrn = 0;
3616 pcum->aapcs_nextncrn = 0;
3617 pcum->aapcs_nextnvrn = 0;
3618 pcum->pcs_variant = ARM_PCS_AAPCS64;
3619 pcum->aapcs_reg = NULL_RTX;
3620 pcum->aapcs_arg_processed = false;
3621 pcum->aapcs_stack_words = 0;
3622 pcum->aapcs_stack_size = 0;
3623
3624 if (!TARGET_FLOAT
3625 && fndecl && TREE_PUBLIC (fndecl)
3626 && fntype && fntype != error_mark_node)
3627 {
3628 const_tree type = TREE_TYPE (fntype);
3629 machine_mode mode ATTRIBUTE_UNUSED; /* To pass a pointer as an argument. */
3630 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3631 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3632 &mode, &nregs, NULL))
3633 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3634 }
3635 return;
3636 }
3637
3638 static void
3639 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3640 machine_mode mode,
3641 const_tree type,
3642 bool named)
3643 {
3644 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3645 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3646 {
3647 aarch64_layout_arg (pcum_v, mode, type, named);
3648 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3649 != (pcum->aapcs_stack_words != 0));
3650 pcum->aapcs_arg_processed = false;
3651 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3652 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3653 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3654 pcum->aapcs_stack_words = 0;
3655 pcum->aapcs_reg = NULL_RTX;
3656 }
3657 }
3658
3659 bool
3660 aarch64_function_arg_regno_p (unsigned regno)
3661 {
3662 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3663 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3664 }
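/* With the AAPCS64 values of NUM_ARG_REGS == 8 and NUM_FP_ARG_REGS == 8
   used by this port, this accepts exactly x0-x7 and v0-v7.  */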
3665
3666 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3667 PARM_BOUNDARY bits of alignment, but will be given anything up
3668 to STACK_BOUNDARY bits if the type requires it. This makes sure
3669 that both before and after the layout of each argument, the Next
3670 Stacked Argument Address (NSAA) will have a minimum alignment of
3671 8 bytes. */
3672
3673 static unsigned int
3674 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3675 {
3676 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3677 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3678 }
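/* As an illustration, assuming the usual AArch64 values of
   PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128: a type with 8-bit
   alignment is still given 64 bits here, while a 256-bit-aligned type
   is clamped to 128 bits, preserving the NSAA invariant above.  */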
3679
3680 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3681
3682 static fixed_size_mode
3683 aarch64_get_reg_raw_mode (int regno)
3684 {
3685 if (TARGET_SVE && FP_REGNUM_P (regno))
3686 /* Don't use the SVE part of the register for __builtin_apply and
3687 __builtin_return. The SVE registers aren't used by the normal PCS,
3688 so using them there would be a waste of time. The PCS extensions
3689 for SVE types are fundamentally incompatible with the
3690 __builtin_return/__builtin_apply interface. */
3691 return as_a <fixed_size_mode> (V16QImode);
3692 return default_get_reg_raw_mode (regno);
3693 }
3694
3695 /* Implement TARGET_FUNCTION_ARG_PADDING.
3696
3697 Small aggregate types are placed in the lowest memory address.
3698
3699 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3700
3701 static pad_direction
3702 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3703 {
3704 /* On little-endian targets, the least significant byte of every stack
3705 argument is passed at the lowest byte address of the stack slot. */
3706 if (!BYTES_BIG_ENDIAN)
3707 return PAD_UPWARD;
3708
3709 /* Otherwise, integral, floating-point and pointer types are padded downward:
3710 the least significant byte of a stack argument is passed at the highest
3711 byte address of the stack slot. */
3712 if (type
3713 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3714 || POINTER_TYPE_P (type))
3715 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3716 return PAD_DOWNWARD;
3717
3718 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3719 return PAD_UPWARD;
3720 }
3721
3722 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3723
3724 It specifies the padding for the last (and possibly the only)
3725 element of a block move between registers and memory. Viewing
3726 the block as if it were in memory, padding upward means that
3727 the last element is padded after its most significant byte,
3728 while with downward padding the last element is padded on its
3729 least significant byte side.
3730
3731 Small aggregates and small complex types are always padded
3732 upwards.
3733
3734 We don't need to worry about homogeneous floating-point or
3735 short-vector aggregates; their move is not affected by the
3736 padding direction determined here. Regardless of endianness,
3737 each element of such an aggregate is put in the least
3738 significant bits of a fp/simd register.
3739
3740 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3741 register has useful data, and return the opposite if the most
3742 significant byte does. */
3743
3744 bool
3745 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3746 bool first ATTRIBUTE_UNUSED)
3747 {
3748
3749 /* Small composite types are always padded upward. */
3750 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3751 {
3752 HOST_WIDE_INT size;
3753 if (type)
3754 size = int_size_in_bytes (type);
3755 else
3756 /* No frontends can create types with variable-sized modes, so we
3757 shouldn't be asked to pass or return them. */
3758 size = GET_MODE_SIZE (mode).to_constant ();
3759 if (size < 2 * UNITS_PER_WORD)
3760 return true;
3761 }
3762
3763 /* Otherwise, use the default padding. */
3764 return !BYTES_BIG_ENDIAN;
3765 }
3766
3767 static scalar_int_mode
3768 aarch64_libgcc_cmp_return_mode (void)
3769 {
3770 return SImode;
3771 }
3772
3773 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3774
3775 /* We use the 12-bit shifted immediate arithmetic instructions, so values
3776 must be a multiple of (1 << 12), i.e. 4096. */
3777 #define ARITH_FACTOR 4096
3778
3779 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3780 #error Cannot use simple address calculation for stack probing
3781 #endif
3782
3783 /* The pair of scratch registers used for stack probing. */
3784 #define PROBE_STACK_FIRST_REG 9
3785 #define PROBE_STACK_SECOND_REG 10
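/* With the default STACK_CHECK_PROBE_INTERVAL_EXP of 12 this probes the
   stack once every 4 KiB; register numbers 9 and 10 correspond to the
   scratch registers x9 and x10 in this port's numbering.  */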
3786
3787 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3788 inclusive. These are offsets from the current stack pointer. */
3789
3790 static void
3791 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3792 {
3793 HOST_WIDE_INT size;
3794 if (!poly_size.is_constant (&size))
3795 {
3796 sorry ("stack probes for SVE frames");
3797 return;
3798 }
3799
3800 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3801
3802 /* See the same assertion on PROBE_INTERVAL above. */
3803 gcc_assert ((first % ARITH_FACTOR) == 0);
3804
3805 /* See if we have a constant small number of probes to generate. If so,
3806 that's the easy case. */
3807 if (size <= PROBE_INTERVAL)
3808 {
3809 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3810
3811 emit_set_insn (reg1,
3812 plus_constant (Pmode,
3813 stack_pointer_rtx, -(first + base)));
3814 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3815 }
3816
3817 /* The run-time loop is made up of 8 insns in the generic case while the
3818 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
3819 else if (size <= 4 * PROBE_INTERVAL)
3820 {
3821 HOST_WIDE_INT i, rem;
3822
3823 emit_set_insn (reg1,
3824 plus_constant (Pmode,
3825 stack_pointer_rtx,
3826 -(first + PROBE_INTERVAL)));
3827 emit_stack_probe (reg1);
3828
3829 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3830 it exceeds SIZE. If only two probes are needed, this will not
3831 generate any code. Then probe at FIRST + SIZE. */
3832 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3833 {
3834 emit_set_insn (reg1,
3835 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3836 emit_stack_probe (reg1);
3837 }
3838
3839 rem = size - (i - PROBE_INTERVAL);
3840 if (rem > 256)
3841 {
3842 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3843
3844 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3845 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3846 }
3847 else
3848 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3849 }
3850
3851 /* Otherwise, do the same as above, but in a loop. Note that we must be
3852 extra careful with variables wrapping around because we might be at
3853 the very top (or the very bottom) of the address space and we have
3854 to be able to handle this case properly; in particular, we use an
3855 equality test for the loop condition. */
3856 else
3857 {
3858 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3859
3860 /* Step 1: round SIZE to the previous multiple of the interval. */
3861
3862 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3863
3864
3865 /* Step 2: compute initial and final value of the loop counter. */
3866
3867 /* TEST_ADDR = SP + FIRST. */
3868 emit_set_insn (reg1,
3869 plus_constant (Pmode, stack_pointer_rtx, -first));
3870
3871 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3872 HOST_WIDE_INT adjustment = - (first + rounded_size);
3873 if (! aarch64_uimm12_shift (adjustment))
3874 {
3875 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3876 true, Pmode);
3877 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3878 }
3879 else
3880 emit_set_insn (reg2,
3881 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3882
3883 /* Step 3: the loop
3884
3885 do
3886 {
3887 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3888 probe at TEST_ADDR
3889 }
3890 while (TEST_ADDR != LAST_ADDR)
3891
3892 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3893 until it is equal to ROUNDED_SIZE. */
3894
3895 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3896
3897
3898 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3899 that SIZE is equal to ROUNDED_SIZE. */
3900
3901 if (size != rounded_size)
3902 {
3903 HOST_WIDE_INT rem = size - rounded_size;
3904
3905 if (rem > 256)
3906 {
3907 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3908
3909 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3910 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3911 }
3912 else
3913 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3914 }
3915 }
3916
3917 /* Make sure nothing is scheduled before we are done. */
3918 emit_insn (gen_blockage ());
3919 }
3920
3921 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3922 absolute addresses. */
3923
3924 const char *
3925 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3926 {
3927 static int labelno = 0;
3928 char loop_lab[32];
3929 rtx xops[2];
3930
3931 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3932
3933 /* Loop. */
3934 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3935
3936 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3937 xops[0] = reg1;
3938 xops[1] = GEN_INT (PROBE_INTERVAL);
3939 output_asm_insn ("sub\t%0, %0, %1", xops);
3940
3941 /* Probe at TEST_ADDR. */
3942 output_asm_insn ("str\txzr, [%0]", xops);
3943
3944 /* Test if TEST_ADDR == LAST_ADDR. */
3945 xops[1] = reg2;
3946 output_asm_insn ("cmp\t%0, %1", xops);
3947
3948 /* Branch. */
3949 fputs ("\tb.ne\t", asm_out_file);
3950 assemble_name_raw (asm_out_file, loop_lab);
3951 fputc ('\n', asm_out_file);
3952
3953 return "";
3954 }
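/* As a rough illustration, with x9/x10 as the probing registers and a
   4096-byte PROBE_INTERVAL, the loop emitted above looks like:

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0  */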
3955
3956 /* Determine whether a frame chain needs to be generated. */
3957 static bool
3958 aarch64_needs_frame_chain (void)
3959 {
3960 /* Force a frame chain for EH returns so the return address is at FP+8. */
3961 if (frame_pointer_needed || crtl->calls_eh_return)
3962 return true;
3963
3964 /* A leaf function cannot have calls or write LR. */
3965 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
3966
3967 /* Don't use a frame chain in leaf functions if leaf frame pointers
3968 are disabled. */
3969 if (flag_omit_leaf_frame_pointer && is_leaf)
3970 return false;
3971
3972 return aarch64_use_frame_pointer;
3973 }
3974
3975 /* Mark the registers that need to be saved by the callee and calculate
3976 the size of the callee-saved registers area and frame record (both FP
3977 and LR may be omitted). */
3978 static void
3979 aarch64_layout_frame (void)
3980 {
3981 HOST_WIDE_INT offset = 0;
3982 int regno, last_fp_reg = INVALID_REGNUM;
3983
3984 if (reload_completed && cfun->machine->frame.laid_out)
3985 return;
3986
3987 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
3988
3989 #define SLOT_NOT_REQUIRED (-2)
3990 #define SLOT_REQUIRED (-1)
3991
3992 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3993 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3994
3995 /* First mark all the registers that really need to be saved... */
3996 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3997 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3998
3999 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4000 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4001
4002 /* ... that includes the eh data registers (if needed)... */
4003 if (crtl->calls_eh_return)
4004 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4005 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4006 = SLOT_REQUIRED;
4007
4008 /* ... and any callee saved register that dataflow says is live. */
4009 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4010 if (df_regs_ever_live_p (regno)
4011 && (regno == R30_REGNUM
4012 || !call_used_regs[regno]))
4013 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4014
4015 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4016 if (df_regs_ever_live_p (regno)
4017 && !call_used_regs[regno])
4018 {
4019 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4020 last_fp_reg = regno;
4021 }
4022
4023 if (cfun->machine->frame.emit_frame_chain)
4024 {
4025 /* FP and LR are placed in the linkage record. */
4026 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4027 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4028 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4029 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4030 offset = 2 * UNITS_PER_WORD;
4031 }
4032
4033 /* Now assign stack slots for them. */
4034 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4035 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4036 {
4037 cfun->machine->frame.reg_offset[regno] = offset;
4038 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4039 cfun->machine->frame.wb_candidate1 = regno;
4040 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4041 cfun->machine->frame.wb_candidate2 = regno;
4042 offset += UNITS_PER_WORD;
4043 }
4044
4045 HOST_WIDE_INT max_int_offset = offset;
4046 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4047 bool has_align_gap = offset != max_int_offset;
4048
4049 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4050 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4051 {
4052 /* If there is an alignment gap between integer and fp callee-saves,
4053 allocate the last fp register to it if possible. */
4054 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4055 {
4056 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4057 break;
4058 }
4059
4060 cfun->machine->frame.reg_offset[regno] = offset;
4061 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4062 cfun->machine->frame.wb_candidate1 = regno;
4063 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4064 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4065 cfun->machine->frame.wb_candidate2 = regno;
4066 offset += UNITS_PER_WORD;
4067 }
4068
4069 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4070
4071 cfun->machine->frame.saved_regs_size = offset;
4072
4073 HOST_WIDE_INT varargs_and_saved_regs_size
4074 = offset + cfun->machine->frame.saved_varargs_size;
4075
4076 cfun->machine->frame.hard_fp_offset
4077 = aligned_upper_bound (varargs_and_saved_regs_size
4078 + get_frame_size (),
4079 STACK_BOUNDARY / BITS_PER_UNIT);
4080
4081 /* Both these values are already aligned. */
4082 gcc_assert (multiple_p (crtl->outgoing_args_size,
4083 STACK_BOUNDARY / BITS_PER_UNIT));
4084 cfun->machine->frame.frame_size
4085 = (cfun->machine->frame.hard_fp_offset
4086 + crtl->outgoing_args_size);
4087
4088 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4089
4090 cfun->machine->frame.initial_adjust = 0;
4091 cfun->machine->frame.final_adjust = 0;
4092 cfun->machine->frame.callee_adjust = 0;
4093 cfun->machine->frame.callee_offset = 0;
4094
4095 HOST_WIDE_INT max_push_offset = 0;
4096 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4097 max_push_offset = 512;
4098 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4099 max_push_offset = 256;
4100
4101 HOST_WIDE_INT const_size, const_fp_offset;
4102 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4103 && const_size < max_push_offset
4104 && known_eq (crtl->outgoing_args_size, 0))
4105 {
4106 /* Simple, small frame with no outgoing arguments:
4107 stp reg1, reg2, [sp, -frame_size]!
4108 stp reg3, reg4, [sp, 16] */
4109 cfun->machine->frame.callee_adjust = const_size;
4110 }
4111 else if (known_lt (crtl->outgoing_args_size
4112 + cfun->machine->frame.saved_regs_size, 512)
4113 && !(cfun->calls_alloca
4114 && known_lt (cfun->machine->frame.hard_fp_offset,
4115 max_push_offset)))
4116 {
4117 /* Frame with small outgoing arguments:
4118 sub sp, sp, frame_size
4119 stp reg1, reg2, [sp, outgoing_args_size]
4120 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4121 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4122 cfun->machine->frame.callee_offset
4123 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4124 }
4125 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4126 && const_fp_offset < max_push_offset)
4127 {
4128 /* Frame with large outgoing arguments but a small local area:
4129 stp reg1, reg2, [sp, -hard_fp_offset]!
4130 stp reg3, reg4, [sp, 16]
4131 sub sp, sp, outgoing_args_size */
4132 cfun->machine->frame.callee_adjust = const_fp_offset;
4133 cfun->machine->frame.final_adjust
4134 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4135 }
4136 else
4137 {
4138 /* Frame with large local area and outgoing arguments using frame pointer:
4139 sub sp, sp, hard_fp_offset
4140 stp x29, x30, [sp, 0]
4141 add x29, sp, 0
4142 stp reg3, reg4, [sp, 16]
4143 sub sp, sp, outgoing_args_size */
4144 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4145 cfun->machine->frame.final_adjust
4146 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4147 }
4148
4149 cfun->machine->frame.laid_out = true;
4150 }
4151
4152 /* Return true if the register REGNO is saved on entry to
4153 the current function. */
4154
4155 static bool
4156 aarch64_register_saved_on_entry (int regno)
4157 {
4158 return cfun->machine->frame.reg_offset[regno] >= 0;
4159 }
4160
4161 /* Return the first register at or after REGNO, up to LIMIT, that the
4162 callee needs to save. */
4163
4164 static unsigned
4165 aarch64_next_callee_save (unsigned regno, unsigned limit)
4166 {
4167 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4168 regno ++;
4169 return regno;
4170 }
4171
4172 /* Push the register number REGNO of mode MODE to the stack with write-back
4173 adjusting the stack by ADJUSTMENT. */
4174
4175 static void
4176 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4177 HOST_WIDE_INT adjustment)
4178 {
4179 rtx base_rtx = stack_pointer_rtx;
4180 rtx insn, reg, mem;
4181
4182 reg = gen_rtx_REG (mode, regno);
4183 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4184 plus_constant (Pmode, base_rtx, -adjustment));
4185 mem = gen_frame_mem (mode, mem);
4186
4187 insn = emit_move_insn (mem, reg);
4188 RTX_FRAME_RELATED_P (insn) = 1;
4189 }
4190
4191 /* Generate and return an instruction to store the pair of registers
4192 REG and REG2 of mode MODE to location BASE with write-back adjusting
4193 the stack location BASE by ADJUSTMENT. */
4194
4195 static rtx
4196 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4197 HOST_WIDE_INT adjustment)
4198 {
4199 switch (mode)
4200 {
4201 case E_DImode:
4202 return gen_storewb_pairdi_di (base, base, reg, reg2,
4203 GEN_INT (-adjustment),
4204 GEN_INT (UNITS_PER_WORD - adjustment));
4205 case E_DFmode:
4206 return gen_storewb_pairdf_di (base, base, reg, reg2,
4207 GEN_INT (-adjustment),
4208 GEN_INT (UNITS_PER_WORD - adjustment));
4209 default:
4210 gcc_unreachable ();
4211 }
4212 }
4213
4214 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4215 stack pointer by ADJUSTMENT. */
4216
4217 static void
4218 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4219 {
4220 rtx_insn *insn;
4221 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4222
4223 if (regno2 == INVALID_REGNUM)
4224 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4225
4226 rtx reg1 = gen_rtx_REG (mode, regno1);
4227 rtx reg2 = gen_rtx_REG (mode, regno2);
4228
4229 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4230 reg2, adjustment));
4231 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4232 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4233 RTX_FRAME_RELATED_P (insn) = 1;
4234 }
4235
4236 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4237 adjusting it by ADJUSTMENT afterwards. */
4238
4239 static rtx
4240 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4241 HOST_WIDE_INT adjustment)
4242 {
4243 switch (mode)
4244 {
4245 case E_DImode:
4246 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4247 GEN_INT (UNITS_PER_WORD));
4248 case E_DFmode:
4249 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4250 GEN_INT (UNITS_PER_WORD));
4251 default:
4252 gcc_unreachable ();
4253 }
4254 }
4255
4256 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4257 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4258 into CFI_OPS. */
4259
4260 static void
4261 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4262 rtx *cfi_ops)
4263 {
4264 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4265 rtx reg1 = gen_rtx_REG (mode, regno1);
4266
4267 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4268
4269 if (regno2 == INVALID_REGNUM)
4270 {
4271 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4272 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4273 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4274 }
4275 else
4276 {
4277 rtx reg2 = gen_rtx_REG (mode, regno2);
4278 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4279 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4280 reg2, adjustment));
4281 }
4282 }
4283
4284 /* Generate and return a store pair instruction of mode MODE to store
4285 register REG1 to MEM1 and register REG2 to MEM2. */
4286
4287 static rtx
4288 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4289 rtx reg2)
4290 {
4291 switch (mode)
4292 {
4293 case E_DImode:
4294 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4295
4296 case E_DFmode:
4297 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4298
4299 default:
4300 gcc_unreachable ();
4301 }
4302 }
4303
4304 /* Generate and return a load pair instruction of mode MODE to load register
4305 REG1 from MEM1 and register REG2 from MEM2. */
4306
4307 static rtx
4308 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4309 rtx mem2)
4310 {
4311 switch (mode)
4312 {
4313 case E_DImode:
4314 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4315
4316 case E_DFmode:
4317 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4318
4319 default:
4320 gcc_unreachable ();
4321 }
4322 }
4323
4324 /* Return TRUE if return address signing should be enabled for the current
4325 function, otherwise return FALSE. */
4326
4327 bool
4328 aarch64_return_address_signing_enabled (void)
4329 {
4330 /* This function should only be called after the frame has been laid out. */
4331 gcc_assert (cfun->machine->frame.laid_out);
4332
4333 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4334 if its LR is pushed onto the stack. */
4335 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4336 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4337 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4338 }
4339
4340 /* Emit code to save the callee-saved registers of mode MODE from register
4341 number START to LIMIT to the stack at the location starting at offset
4342 START_OFFSET, skipping any write-back candidates if SKIP_WB is true. */
4343
4344 static void
4345 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4346 unsigned start, unsigned limit, bool skip_wb)
4347 {
4348 rtx_insn *insn;
4349 unsigned regno;
4350 unsigned regno2;
4351
4352 for (regno = aarch64_next_callee_save (start, limit);
4353 regno <= limit;
4354 regno = aarch64_next_callee_save (regno + 1, limit))
4355 {
4356 rtx reg, mem;
4357 poly_int64 offset;
4358
4359 if (skip_wb
4360 && (regno == cfun->machine->frame.wb_candidate1
4361 || regno == cfun->machine->frame.wb_candidate2))
4362 continue;
4363
4364 if (cfun->machine->reg_is_wrapped_separately[regno])
4365 continue;
4366
4367 reg = gen_rtx_REG (mode, regno);
4368 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4369 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4370 offset));
4371
4372 regno2 = aarch64_next_callee_save (regno + 1, limit);
4373
4374 if (regno2 <= limit
4375 && !cfun->machine->reg_is_wrapped_separately[regno2]
4376 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4377 == cfun->machine->frame.reg_offset[regno2]))
4378
4379 {
4380 rtx reg2 = gen_rtx_REG (mode, regno2);
4381 rtx mem2;
4382
4383 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4384 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4385 offset));
4386 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4387 reg2));
4388
4389 /* The first part of a frame-related parallel insn is
4390 always assumed to be relevant to the frame
4391 calculations; subsequent parts are only
4392 frame-related if explicitly marked. */
4393 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4394 regno = regno2;
4395 }
4396 else
4397 insn = emit_move_insn (mem, reg);
4398
4399 RTX_FRAME_RELATED_P (insn) = 1;
4400 }
4401 }
4402
4403 /* Emit code to restore the callee registers of mode MODE from register
4404 number START up to and including LIMIT. Restore from the stack offset
4405 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4406 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4407
4408 static void
4409 aarch64_restore_callee_saves (machine_mode mode,
4410 poly_int64 start_offset, unsigned start,
4411 unsigned limit, bool skip_wb, rtx *cfi_ops)
4412 {
4413 rtx base_rtx = stack_pointer_rtx;
4414 unsigned regno;
4415 unsigned regno2;
4416 poly_int64 offset;
4417
4418 for (regno = aarch64_next_callee_save (start, limit);
4419 regno <= limit;
4420 regno = aarch64_next_callee_save (regno + 1, limit))
4421 {
4422 if (cfun->machine->reg_is_wrapped_separately[regno])
4423 continue;
4424
4425 rtx reg, mem;
4426
4427 if (skip_wb
4428 && (regno == cfun->machine->frame.wb_candidate1
4429 || regno == cfun->machine->frame.wb_candidate2))
4430 continue;
4431
4432 reg = gen_rtx_REG (mode, regno);
4433 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4434 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4435
4436 regno2 = aarch64_next_callee_save (regno + 1, limit);
4437
4438 if (regno2 <= limit
4439 && !cfun->machine->reg_is_wrapped_separately[regno2]
4440 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4441 == cfun->machine->frame.reg_offset[regno2]))
4442 {
4443 rtx reg2 = gen_rtx_REG (mode, regno2);
4444 rtx mem2;
4445
4446 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4447 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4448 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4449
4450 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4451 regno = regno2;
4452 }
4453 else
4454 emit_move_insn (reg, mem);
4455 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4456 }
4457 }
4458
4459 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4460 of MODE. */
4461
4462 static inline bool
4463 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4464 {
4465 HOST_WIDE_INT multiple;
4466 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4467 && IN_RANGE (multiple, -8, 7));
4468 }
4469
4470 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4471 of MODE. */
4472
4473 static inline bool
4474 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4475 {
4476 HOST_WIDE_INT multiple;
4477 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4478 && IN_RANGE (multiple, 0, 63));
4479 }
4480
4481 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4482 of MODE. */
4483
4484 bool
4485 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4486 {
4487 HOST_WIDE_INT multiple;
4488 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4489 && IN_RANGE (multiple, -64, 63));
4490 }
4491
4492 /* Return true if OFFSET is a signed 9-bit value. */
4493
4494 static inline bool
4495 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4496 poly_int64 offset)
4497 {
4498 HOST_WIDE_INT const_offset;
4499 return (offset.is_constant (&const_offset)
4500 && IN_RANGE (const_offset, -256, 255));
4501 }
4502
4503 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4504 of MODE. */
4505
4506 static inline bool
4507 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4508 {
4509 HOST_WIDE_INT multiple;
4510 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4511 && IN_RANGE (multiple, -256, 255));
4512 }
4513
4514 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4515 of MODE. */
4516
4517 static inline bool
4518 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4519 {
4520 HOST_WIDE_INT multiple;
4521 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4522 && IN_RANGE (multiple, 0, 4095));
4523 }
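/* For DImode accesses (8-byte units) the predicates above therefore accept
   byte offsets of -64..56 (4-bit), 0..504 (6-bit), -512..504 (7-bit,
   roughly the LDP/STP immediate range), -256..255 (9-bit unscaled),
   -2048..2040 (9-bit scaled) and 0..32760 (12-bit, roughly the unsigned
   LDR/STR immediate range).  */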
4524
4525 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4526
4527 static sbitmap
4528 aarch64_get_separate_components (void)
4529 {
4530 aarch64_layout_frame ();
4531
4532 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4533 bitmap_clear (components);
4534
4535 /* The registers we need saved to the frame. */
4536 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4537 if (aarch64_register_saved_on_entry (regno))
4538 {
4539 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4540 if (!frame_pointer_needed)
4541 offset += cfun->machine->frame.frame_size
4542 - cfun->machine->frame.hard_fp_offset;
4543 /* Check that we can access the stack slot of the register with one
4544 direct load with no adjustments needed. */
4545 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4546 bitmap_set_bit (components, regno);
4547 }
4548
4549 /* Don't mess with the hard frame pointer. */
4550 if (frame_pointer_needed)
4551 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4552
4553 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4554 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4555 /* If aarch64_layout_frame has chosen registers to store/restore with
4556 writeback don't interfere with them to avoid having to output explicit
4557 stack adjustment instructions. */
4558 if (reg2 != INVALID_REGNUM)
4559 bitmap_clear_bit (components, reg2);
4560 if (reg1 != INVALID_REGNUM)
4561 bitmap_clear_bit (components, reg1);
4562
4563 bitmap_clear_bit (components, LR_REGNUM);
4564 bitmap_clear_bit (components, SP_REGNUM);
4565
4566 return components;
4567 }
4568
4569 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4570
4571 static sbitmap
4572 aarch64_components_for_bb (basic_block bb)
4573 {
4574 bitmap in = DF_LIVE_IN (bb);
4575 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4576 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4577
4578 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4579 bitmap_clear (components);
4580
4581 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4582 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4583 if ((!call_used_regs[regno])
4584 && (bitmap_bit_p (in, regno)
4585 || bitmap_bit_p (gen, regno)
4586 || bitmap_bit_p (kill, regno)))
4587 {
4588 unsigned regno2, offset, offset2;
4589 bitmap_set_bit (components, regno);
4590
4591 /* If there is a callee-save at an adjacent offset, add it too,
4592 to increase the use of LDP/STP. */
4593 offset = cfun->machine->frame.reg_offset[regno];
4594 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4595
4596 if (regno2 <= LAST_SAVED_REGNUM)
4597 {
4598 offset2 = cfun->machine->frame.reg_offset[regno2];
4599 if ((offset & ~8) == (offset2 & ~8))
4600 bitmap_set_bit (components, regno2);
4601 }
4602 }
4603
4604 return components;
4605 }
4606
4607 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4608 Nothing to do for aarch64. */
4609
4610 static void
4611 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4612 {
4613 }
4614
4615 /* Return the next set bit in BMP from START onwards. Return the total number
4616 of bits in BMP if no set bit is found at or after START. */
4617
4618 static unsigned int
4619 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4620 {
4621 unsigned int nbits = SBITMAP_SIZE (bmp);
4622 if (start == nbits)
4623 return start;
4624
4625 gcc_assert (start < nbits);
4626 for (unsigned int i = start; i < nbits; i++)
4627 if (bitmap_bit_p (bmp, i))
4628 return i;
4629
4630 return nbits;
4631 }
4632
4633 /* Do the work for aarch64_emit_prologue_components and
4634 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4635 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4636 for these components or the epilogue sequence. That is, it determines
4637 whether we should emit stores or loads and what kind of CFA notes to attach
4638 to the insns. Otherwise the logic for the two sequences is very
4639 similar. */
4640
4641 static void
4642 aarch64_process_components (sbitmap components, bool prologue_p)
4643 {
4644 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4645 ? HARD_FRAME_POINTER_REGNUM
4646 : STACK_POINTER_REGNUM);
4647
4648 unsigned last_regno = SBITMAP_SIZE (components);
4649 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4650 rtx_insn *insn = NULL;
4651
4652 while (regno != last_regno)
4653 {
4654 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4655 so DFmode for the vector registers is enough. */
4656 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4657 rtx reg = gen_rtx_REG (mode, regno);
4658 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4659 if (!frame_pointer_needed)
4660 offset += cfun->machine->frame.frame_size
4661 - cfun->machine->frame.hard_fp_offset;
4662 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4663 rtx mem = gen_frame_mem (mode, addr);
4664
4665 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4666 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4667 /* No more registers to handle after REGNO.
4668 Emit a single save/restore and exit. */
4669 if (regno2 == last_regno)
4670 {
4671 insn = emit_insn (set);
4672 RTX_FRAME_RELATED_P (insn) = 1;
4673 if (prologue_p)
4674 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4675 else
4676 add_reg_note (insn, REG_CFA_RESTORE, reg);
4677 break;
4678 }
4679
4680 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4681 /* The next register is not of the same class or its offset is not
4682 mergeable with the current one into a pair. */
4683 if (!satisfies_constraint_Ump (mem)
4684 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4685 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4686 GET_MODE_SIZE (mode)))
4687 {
4688 insn = emit_insn (set);
4689 RTX_FRAME_RELATED_P (insn) = 1;
4690 if (prologue_p)
4691 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4692 else
4693 add_reg_note (insn, REG_CFA_RESTORE, reg);
4694
4695 regno = regno2;
4696 continue;
4697 }
4698
4699 /* REGNO2 can be saved/restored in a pair with REGNO. */
4700 rtx reg2 = gen_rtx_REG (mode, regno2);
4701 if (!frame_pointer_needed)
4702 offset2 += cfun->machine->frame.frame_size
4703 - cfun->machine->frame.hard_fp_offset;
4704 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4705 rtx mem2 = gen_frame_mem (mode, addr2);
4706 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4707 : gen_rtx_SET (reg2, mem2);
4708
4709 if (prologue_p)
4710 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4711 else
4712 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4713
4714 RTX_FRAME_RELATED_P (insn) = 1;
4715 if (prologue_p)
4716 {
4717 add_reg_note (insn, REG_CFA_OFFSET, set);
4718 add_reg_note (insn, REG_CFA_OFFSET, set2);
4719 }
4720 else
4721 {
4722 add_reg_note (insn, REG_CFA_RESTORE, reg);
4723 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4724 }
4725
4726 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4727 }
4728 }
4729
4730 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4731
4732 static void
4733 aarch64_emit_prologue_components (sbitmap components)
4734 {
4735 aarch64_process_components (components, true);
4736 }
4737
4738 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4739
4740 static void
4741 aarch64_emit_epilogue_components (sbitmap components)
4742 {
4743 aarch64_process_components (components, false);
4744 }
4745
4746 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4747
4748 static void
4749 aarch64_set_handled_components (sbitmap components)
4750 {
4751 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4752 if (bitmap_bit_p (components, regno))
4753 cfun->machine->reg_is_wrapped_separately[regno] = true;
4754 }
4755
4756 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4757 is saved at BASE + OFFSET. */
4758
4759 static void
4760 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4761 rtx base, poly_int64 offset)
4762 {
4763 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4764 add_reg_note (insn, REG_CFA_EXPRESSION,
4765 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4766 }
4767
4768 /* AArch64 stack frames generated by this compiler look like:
4769
4770 +-------------------------------+
4771 | |
4772 | incoming stack arguments |
4773 | |
4774 +-------------------------------+
4775 | | <-- incoming stack pointer (aligned)
4776 | callee-allocated save area |
4777 | for register varargs |
4778 | |
4779 +-------------------------------+
4780 | local variables | <-- frame_pointer_rtx
4781 | |
4782 +-------------------------------+
4783 | padding0 | \
4784 +-------------------------------+ |
4785 | callee-saved registers | | frame.saved_regs_size
4786 +-------------------------------+ |
4787 | LR' | |
4788 +-------------------------------+ |
4789 | FP' | / <- hard_frame_pointer_rtx (aligned)
4790 +-------------------------------+
4791 | dynamic allocation |
4792 +-------------------------------+
4793 | padding |
4794 +-------------------------------+
4795 | outgoing stack arguments | <-- arg_pointer
4796 | |
4797 +-------------------------------+
4798 | | <-- stack_pointer_rtx (aligned)
4799
4800 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4801 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4802 unchanged. */
4803
4804 /* Generate the prologue instructions for entry into a function.
4805 Establish the stack frame by decreasing the stack pointer with a
4806 properly calculated size and, if necessary, create a frame record
4807 filled with the values of LR and previous frame pointer. The
4808 current FP is also set up if it is in use. */
4809
4810 void
4811 aarch64_expand_prologue (void)
4812 {
4813 aarch64_layout_frame ();
4814
4815 poly_int64 frame_size = cfun->machine->frame.frame_size;
4816 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4817 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4818 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4819 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4820 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4821 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4822 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4823 rtx_insn *insn;
4824
4825 /* Sign return address for functions. */
4826 if (aarch64_return_address_signing_enabled ())
4827 {
4828 insn = emit_insn (gen_pacisp ());
4829 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4830 RTX_FRAME_RELATED_P (insn) = 1;
4831 }
4832
4833 if (flag_stack_usage_info)
4834 current_function_static_stack_size = constant_lower_bound (frame_size);
4835
4836 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4837 {
4838 if (crtl->is_leaf && !cfun->calls_alloca)
4839 {
4840 if (maybe_gt (frame_size, PROBE_INTERVAL)
4841 && maybe_gt (frame_size, get_stack_check_protect ()))
4842 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4843 (frame_size
4844 - get_stack_check_protect ()));
4845 }
4846 else if (maybe_gt (frame_size, 0))
4847 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4848 }
4849
4850 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4851 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4852
4853 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4854
4855 if (callee_adjust != 0)
4856 aarch64_push_regs (reg1, reg2, callee_adjust);
4857
4858 if (emit_frame_chain)
4859 {
4860 poly_int64 reg_offset = callee_adjust;
4861 if (callee_adjust == 0)
4862 {
4863 reg1 = R29_REGNUM;
4864 reg2 = R30_REGNUM;
4865 reg_offset = callee_offset;
4866 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4867 }
4868 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4869 stack_pointer_rtx, callee_offset,
4870 ip1_rtx, ip0_rtx, frame_pointer_needed);
4871 if (frame_pointer_needed && !frame_size.is_constant ())
4872 {
4873 /* Variable-sized frames need to describe the save slot
4874 address using DW_CFA_expression rather than DW_CFA_offset.
4875 This means that, without taking further action, the
4876 locations of the registers that we've already saved would
4877 remain based on the stack pointer even after we redefine
4878 the CFA based on the frame pointer. We therefore need new
4879 DW_CFA_expressions to re-express the save slots with addresses
4880 based on the frame pointer. */
4881 rtx_insn *insn = get_last_insn ();
4882 gcc_assert (RTX_FRAME_RELATED_P (insn));
4883
4884 /* Add an explicit CFA definition if this was previously
4885 implicit. */
4886 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4887 {
4888 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4889 callee_offset);
4890 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4891 gen_rtx_SET (hard_frame_pointer_rtx, src));
4892 }
4893
4894 /* Change the save slot expressions for the registers that
4895 we've already saved. */
4896 reg_offset -= callee_offset;
4897 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4898 reg_offset + UNITS_PER_WORD);
4899 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4900 reg_offset);
4901 }
4902 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4903 }
4904
4905 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4906 callee_adjust != 0 || emit_frame_chain);
4907 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4908 callee_adjust != 0 || emit_frame_chain);
4909 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4910 }
4911
4912 /* Return TRUE if we can use a simple_return insn.
4913
4914 This function checks whether the callee-saved stack is empty, which
4915 means that no restore actions are needed. The pro_and_epilogue pass
4916 uses this to check whether the shrink-wrapping optimization is feasible. */
4917
4918 bool
4919 aarch64_use_return_insn_p (void)
4920 {
4921 if (!reload_completed)
4922 return false;
4923
4924 if (crtl->profile)
4925 return false;
4926
4927 aarch64_layout_frame ();
4928
4929 return known_eq (cfun->machine->frame.frame_size, 0);
4930 }
4931
4932 /* Generate the epilogue instructions for returning from a function.
4933 This is almost exactly the reverse of the prolog sequence, except
4934 that we need to insert barriers to avoid scheduling loads that read
4935 from a deallocated stack, and we optimize the unwind records by
4936 emitting them all together if possible. */
4937 void
4938 aarch64_expand_epilogue (bool for_sibcall)
4939 {
4940 aarch64_layout_frame ();
4941
4942 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4943 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4944 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4945 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4946 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4947 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4948 rtx cfi_ops = NULL;
4949 rtx_insn *insn;
4950 /* A stack clash protection prologue may not have left IP0_REGNUM or
4951 IP1_REGNUM in a usable state. The same is true for allocations
4952 with an SVE component, since we then need both temporary registers
4953 for each allocation. */
4954 bool can_inherit_p = (initial_adjust.is_constant ()
4955 && final_adjust.is_constant ()
4956 && !flag_stack_clash_protection);
4957
4958 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4959 bool need_barrier_p
4960 = maybe_ne (get_frame_size ()
4961 + cfun->machine->frame.saved_varargs_size, 0);
4962
4963 /* Emit a barrier to prevent loads from a deallocated stack. */
4964 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4965 || cfun->calls_alloca
4966 || crtl->calls_eh_return)
4967 {
4968 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4969 need_barrier_p = false;
4970 }
4971
4972 /* Restore the stack pointer from the frame pointer if it may not
4973 be the same as the stack pointer. */
4974 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4975 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4976 if (frame_pointer_needed
4977 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4978 /* If writeback is used when restoring callee-saves, the CFA
4979 is restored on the instruction doing the writeback. */
4980 aarch64_add_offset (Pmode, stack_pointer_rtx,
4981 hard_frame_pointer_rtx, -callee_offset,
4982 ip1_rtx, ip0_rtx, callee_adjust == 0);
4983 else
4984 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4985 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4986
4987 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4988 callee_adjust != 0, &cfi_ops);
4989 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4990 callee_adjust != 0, &cfi_ops);
4991
4992 if (need_barrier_p)
4993 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4994
4995 if (callee_adjust != 0)
4996 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4997
4998 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4999 {
5000 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5001 insn = get_last_insn ();
5002 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5003 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5004 RTX_FRAME_RELATED_P (insn) = 1;
5005 cfi_ops = NULL;
5006 }
5007
5008 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5009 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5010
5011 if (cfi_ops)
5012 {
5013 /* Emit delayed restores and reset the CFA to be SP. */
5014 insn = get_last_insn ();
5015 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5016 REG_NOTES (insn) = cfi_ops;
5017 RTX_FRAME_RELATED_P (insn) = 1;
5018 }
5019
5020 /* We prefer to emit the combined return/authenticate instruction RETAA,
5021 however there are three cases in which we must instead emit an explicit
5022 authentication instruction.
5023
5024 1) Sibcalls don't return in a normal way, so if we're about to call one
5025 we must authenticate.
5026
5027 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5028 generating code for !TARGET_ARMV8_3 we can't use it and must
5029 explicitly authenticate.
5030
5031 3) On an eh_return path we make extra stack adjustments to update the
5032 canonical frame address to be the exception handler's CFA. We want
5033 to authenticate using the CFA of the function which calls eh_return.
5034 */
5035 if (aarch64_return_address_signing_enabled ()
5036 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5037 {
5038 insn = emit_insn (gen_autisp ());
5039 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5040 RTX_FRAME_RELATED_P (insn) = 1;
5041 }
5042
5043 /* Stack adjustment for exception handler. */
5044 if (crtl->calls_eh_return)
5045 {
5046 /* We need to unwind the stack by the offset computed by
5047 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5048 to be SP; letting the CFA move during this adjustment
5049 is just as correct as retaining the CFA from the body
5050 of the function. Therefore, do nothing special. */
5051 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5052 }
5053
5054 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5055 if (!for_sibcall)
5056 emit_jump_insn (ret_rtx);
5057 }
5058
5059 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5060 normally or return to a previous frame after unwinding.
5061
5062 An EH return uses a single shared return sequence. The epilogue is
5063 exactly like a normal epilogue except that it has an extra input
5064 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5065 that must be applied after the frame has been destroyed. An extra label
5066 is inserted before the epilogue which initializes this register to zero,
5067 and this is the entry point for a normal return.
5068
5069 An actual EH return updates the return address, initializes the stack
5070 adjustment and jumps directly into the epilogue (bypassing the zeroing
5071 of the adjustment). Since the return address is typically saved on the
5072 stack when a function makes a call, the saved LR must be updated outside
5073 the epilogue.
5074
5075 This poses problems as the store is generated well before the epilogue,
5076 so the offset of LR is not known yet. Also optimizations will remove the
5077 store as it appears dead, even after the epilogue is generated (as the
5078 base or offset for loading LR is different in many cases).
5079
5080 To avoid these problems this implementation forces the frame pointer
5081 in eh_return functions so that the location of LR is fixed and known early.
5082 It also marks the store volatile, so no optimization is permitted to
5083 remove the store. */
5084 rtx
5085 aarch64_eh_return_handler_rtx (void)
5086 {
5087 rtx tmp = gen_frame_mem (Pmode,
5088 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5089
5090 /* Mark the store volatile, so no optimization is permitted to remove it. */
5091 MEM_VOLATILE_P (tmp) = true;
5092 return tmp;
5093 }
5094
5095 /* Output code to add DELTA to the first argument, and then jump
5096 to FUNCTION. Used for C++ multiple inheritance. */
5097 static void
5098 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5099 HOST_WIDE_INT delta,
5100 HOST_WIDE_INT vcall_offset,
5101 tree function)
5102 {
5103 /* The this pointer is always in x0. Note that this differs from
5104 Arm where the this pointer may be bumped to r1 if r0 is required
5105 to return a pointer to an aggregate. On AArch64 a result value
5106 pointer will be in x8. */
5107 int this_regno = R0_REGNUM;
5108 rtx this_rtx, temp0, temp1, addr, funexp;
5109 rtx_insn *insn;
5110
5111 reload_completed = 1;
5112 emit_note (NOTE_INSN_PROLOGUE_END);
5113
5114 this_rtx = gen_rtx_REG (Pmode, this_regno);
5115 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5116 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5117
5118 if (vcall_offset == 0)
5119 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5120 else
5121 {
5122 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5123
5124 addr = this_rtx;
5125 if (delta != 0)
5126 {
5127 if (delta >= -256 && delta < 256)
5128 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5129 plus_constant (Pmode, this_rtx, delta));
5130 else
5131 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5132 temp1, temp0, false);
5133 }
5134
5135 if (Pmode == ptr_mode)
5136 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5137 else
5138 aarch64_emit_move (temp0,
5139 gen_rtx_ZERO_EXTEND (Pmode,
5140 gen_rtx_MEM (ptr_mode, addr)));
5141
5142 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5143 addr = plus_constant (Pmode, temp0, vcall_offset);
5144 else
5145 {
5146 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5147 Pmode);
5148 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5149 }
5150
5151 if (Pmode == ptr_mode)
5152 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5153 else
5154 aarch64_emit_move (temp1,
5155 gen_rtx_SIGN_EXTEND (Pmode,
5156 gen_rtx_MEM (ptr_mode, addr)));
5157
5158 emit_insn (gen_add2_insn (this_rtx, temp1));
5159 }
5160
5161 /* Generate a tail call to the target function. */
5162 if (!TREE_USED (function))
5163 {
5164 assemble_external (function);
5165 TREE_USED (function) = 1;
5166 }
5167 funexp = XEXP (DECL_RTL (function), 0);
5168 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5169 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5170 SIBLING_CALL_P (insn) = 1;
5171
5172 insn = get_insns ();
5173 shorten_branches (insn);
5174 final_start_function (insn, file, 1);
5175 final (insn, file, 1);
5176 final_end_function ();
5177
5178 /* Stop pretending to be a post-reload pass. */
5179 reload_completed = 0;
5180 }
5181
5182 static bool
5183 aarch64_tls_referenced_p (rtx x)
5184 {
5185 if (!TARGET_HAVE_TLS)
5186 return false;
5187 subrtx_iterator::array_type array;
5188 FOR_EACH_SUBRTX (iter, array, x, ALL)
5189 {
5190 const_rtx x = *iter;
5191 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5192 return true;
5193 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5194 TLS offsets, not real symbol references. */
5195 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5196 iter.skip_subrtxes ();
5197 }
5198 return false;
5199 }
5200
5201
5202 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5203 a left shift of 0 or 12 bits. */
5204 bool
5205 aarch64_uimm12_shift (HOST_WIDE_INT val)
5206 {
5207 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5208 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5209 );
5210 }
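/* For example, 0xabc and 0xabc000 both satisfy this (shift 0 and shift 12
   respectively), whereas 0xabc001 does not, because its set bits span both
   12-bit fields.  */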
5211
5212
5213 /* Return true if val is an immediate that can be loaded into a
5214 register by a MOVZ instruction. */
5215 static bool
5216 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5217 {
5218 if (GET_MODE_SIZE (mode) > 4)
5219 {
5220 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5221 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5222 return 1;
5223 }
5224 else
5225 {
5226 /* Ignore sign extension. */
5227 val &= (HOST_WIDE_INT) 0xffffffff;
5228 }
5229 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5230 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5231 }
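/* For example, in DImode 0xffff and (HOST_WIDE_INT) 0x1234 << 32 are both
   accepted (a 16-bit chunk at shift 0, 16, 32 or 48), while 0x10001 is
   rejected because it has set bits in two different 16-bit chunks.  */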
5232
5233 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5234 64-bit (DImode) integer. */
5235
5236 static unsigned HOST_WIDE_INT
5237 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5238 {
5239 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5240 while (size < 64)
5241 {
5242 val &= (HOST_WIDE_INT_1U << size) - 1;
5243 val |= val << size;
5244 size *= 2;
5245 }
5246 return val;
5247 }
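/* For example, the 16-bit (HImode) value 0x00ff is replicated to
   0x00ff00ff00ff00ff, so the bitmask test below only ever has to reason
   about full 64-bit patterns.  */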
5248
5249 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5250
5251 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5252 {
5253 0x0000000100000001ull,
5254 0x0001000100010001ull,
5255 0x0101010101010101ull,
5256 0x1111111111111111ull,
5257 0x5555555555555555ull,
5258 };
5259
5260
5261 /* Return true if val is a valid bitmask immediate. */
5262
5263 bool
5264 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5265 {
5266 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5267 int bits;
5268
5269 /* Check for a single sequence of one bits and return quickly if so.
5270 The special cases of all ones and all zeroes return false. */
5271 val = aarch64_replicate_bitmask_imm (val_in, mode);
5272 tmp = val + (val & -val);
5273
5274 if (tmp == (tmp & -tmp))
5275 return (val + 1) > 1;
5276
5277 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5278 if (mode == SImode)
5279 val = (val << 32) | (val & 0xffffffff);
5280
5281 /* Invert if the immediate doesn't start with a zero bit - this means we
5282 only need to search for sequences of one bits. */
5283 if (val & 1)
5284 val = ~val;
5285
5286 /* Find the first set bit and set tmp to val with the first sequence of one
5287 bits removed. Return success if there is a single sequence of ones. */
5288 first_one = val & -val;
5289 tmp = val & (val + first_one);
5290
5291 if (tmp == 0)
5292 return true;
5293
5294 /* Find the next set bit and compute the difference in bit position. */
5295 next_one = tmp & -tmp;
5296 bits = clz_hwi (first_one) - clz_hwi (next_one);
5297 mask = val ^ tmp;
5298
5299 /* Check that the bit position difference is a power of 2, and that the first
5300 sequence of one bits fits within 'bits' bits. */
5301 if ((mask >> bits) != 0 || bits != (bits & -bits))
5302 return false;
5303
5304 /* Check that the sequence of one bits is repeated 64/bits times. */
5305 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5306 }
5307
5308 /* Create a mask of ones covering the range from the lowest to the highest
5309 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
5310
5311 unsigned HOST_WIDE_INT
5312 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5313 {
5314 int lowest_bit_set = ctz_hwi (val_in);
5315 int highest_bit_set = floor_log2 (val_in);
5316 gcc_assert (val_in != 0);
5317
5318 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5319 (HOST_WIDE_INT_1U << lowest_bit_set));
5320 }
5321
5322 /* Create a constant in which the bits outside the range from the lowest
5323 set bit to the highest set bit of VAL_IN are set to 1. */
5324
5325 unsigned HOST_WIDE_INT
5326 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5327 {
5328 return val_in | ~aarch64_and_split_imm1 (val_in);
5329 }
5330
5331 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
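/* This tests whether an AND with VAL_IN can instead be performed as two ANDs
   with the bitmask immediates produced by aarch64_and_split_imm1/imm2.  For
   instance (illustrative), 0x0000fffefffe0000 is neither a bitmask nor a
   move immediate, but it equals 0x0000fffffffe0000 & 0xfffffffeffffffff,
   both of which are valid bitmask immediates.  */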
5332
5333 bool
5334 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5335 {
5336 scalar_int_mode int_mode;
5337 if (!is_a <scalar_int_mode> (mode, &int_mode))
5338 return false;
5339
5340 if (aarch64_bitmask_imm (val_in, int_mode))
5341 return false;
5342
5343 if (aarch64_move_imm (val_in, int_mode))
5344 return false;
5345
5346 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5347
5348 return aarch64_bitmask_imm (imm2, int_mode);
5349 }
5350
5351 /* Return true if val is an immediate that can be loaded into a
5352 register in a single instruction. */
5353 bool
5354 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5355 {
5356 scalar_int_mode int_mode;
5357 if (!is_a <scalar_int_mode> (mode, &int_mode))
5358 return false;
5359
5360 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5361 return true;
5362 return aarch64_bitmask_imm (val, int_mode);
5363 }
5364
5365 static bool
5366 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5367 {
5368 rtx base, offset;
5369
5370 if (GET_CODE (x) == HIGH)
5371 return true;
5372
5373 /* There's no way to calculate VL-based values using relocations. */
5374 subrtx_iterator::array_type array;
5375 FOR_EACH_SUBRTX (iter, array, x, ALL)
5376 if (GET_CODE (*iter) == CONST_POLY_INT)
5377 return true;
5378
5379 split_const (x, &base, &offset);
5380 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5381 {
5382 if (aarch64_classify_symbol (base, INTVAL (offset))
5383 != SYMBOL_FORCE_TO_MEM)
5384 return true;
5385 else
5386 /* Avoid generating a 64-bit relocation in ILP32; leave
5387 to aarch64_expand_mov_immediate to handle it properly. */
5388 return mode != ptr_mode;
5389 }
5390
5391 return aarch64_tls_referenced_p (x);
5392 }
5393
5394 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5395 The expansion of a table switch is quite expensive due to the number
5396 of instructions, the table lookup and the hard-to-predict indirect jump.
5397 When optimizing for speed at -O3 or higher, use the per-core tuning if
5398 set, otherwise use tables for more than 16 cases as a tradeoff between
5399 size and performance. When optimizing for size, use the default setting. */
5400
5401 static unsigned int
5402 aarch64_case_values_threshold (void)
5403 {
5404 /* Use the specified limit for the number of cases before using jump
5405 tables at higher optimization levels. */
5406 if (optimize > 2
5407 && selected_cpu->tune->max_case_values != 0)
5408 return selected_cpu->tune->max_case_values;
5409 else
5410 return optimize_size ? default_case_values_threshold () : 17;
5411 }
5412
5413 /* Return true if register REGNO is a valid index register.
5414 STRICT_P is true if REG_OK_STRICT is in effect. */
5415
5416 bool
5417 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5418 {
5419 if (!HARD_REGISTER_NUM_P (regno))
5420 {
5421 if (!strict_p)
5422 return true;
5423
5424 if (!reg_renumber)
5425 return false;
5426
5427 regno = reg_renumber[regno];
5428 }
5429 return GP_REGNUM_P (regno);
5430 }
5431
5432 /* Return true if register REGNO is a valid base register.
5433 STRICT_P is true if REG_OK_STRICT is in effect. */
5434
5435 bool
5436 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5437 {
5438 if (!HARD_REGISTER_NUM_P (regno))
5439 {
5440 if (!strict_p)
5441 return true;
5442
5443 if (!reg_renumber)
5444 return false;
5445
5446 regno = reg_renumber[regno];
5447 }
5448
5449 /* The fake registers will be eliminated to either the stack or
5450 hard frame pointer, both of which are usually valid base registers.
5451 Reload deals with the cases where the eliminated form isn't valid. */
5452 return (GP_REGNUM_P (regno)
5453 || regno == SP_REGNUM
5454 || regno == FRAME_POINTER_REGNUM
5455 || regno == ARG_POINTER_REGNUM);
5456 }
5457
5458 /* Return true if X is a valid base register.
5459 STRICT_P is true if REG_OK_STRICT is in effect. */
5460
5461 static bool
5462 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5463 {
5464 if (!strict_p
5465 && GET_CODE (x) == SUBREG
5466 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5467 x = SUBREG_REG (x);
5468
5469 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5470 }
5471
5472 /* Return true if address offset is a valid index. If it is, fill in INFO
5473 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5474
5475 static bool
5476 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5477 machine_mode mode, bool strict_p)
5478 {
5479 enum aarch64_address_type type;
5480 rtx index;
5481 int shift;
5482
5483 /* (reg:P) */
5484 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5485 && GET_MODE (x) == Pmode)
5486 {
5487 type = ADDRESS_REG_REG;
5488 index = x;
5489 shift = 0;
5490 }
5491 /* (sign_extend:DI (reg:SI)) */
5492 else if ((GET_CODE (x) == SIGN_EXTEND
5493 || GET_CODE (x) == ZERO_EXTEND)
5494 && GET_MODE (x) == DImode
5495 && GET_MODE (XEXP (x, 0)) == SImode)
5496 {
5497 type = (GET_CODE (x) == SIGN_EXTEND)
5498 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5499 index = XEXP (x, 0);
5500 shift = 0;
5501 }
5502 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5503 else if (GET_CODE (x) == MULT
5504 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5505 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5506 && GET_MODE (XEXP (x, 0)) == DImode
5507 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5508 && CONST_INT_P (XEXP (x, 1)))
5509 {
5510 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5511 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5512 index = XEXP (XEXP (x, 0), 0);
5513 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5514 }
5515 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5516 else if (GET_CODE (x) == ASHIFT
5517 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5518 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5519 && GET_MODE (XEXP (x, 0)) == DImode
5520 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5521 && CONST_INT_P (XEXP (x, 1)))
5522 {
5523 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5524 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5525 index = XEXP (XEXP (x, 0), 0);
5526 shift = INTVAL (XEXP (x, 1));
5527 }
5528 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5529 else if ((GET_CODE (x) == SIGN_EXTRACT
5530 || GET_CODE (x) == ZERO_EXTRACT)
5531 && GET_MODE (x) == DImode
5532 && GET_CODE (XEXP (x, 0)) == MULT
5533 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5534 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5535 {
5536 type = (GET_CODE (x) == SIGN_EXTRACT)
5537 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5538 index = XEXP (XEXP (x, 0), 0);
5539 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5540 if (INTVAL (XEXP (x, 1)) != 32 + shift
5541 || INTVAL (XEXP (x, 2)) != 0)
5542 shift = -1;
5543 }
5544 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5545 (const_int 0xffffffff<<shift)) */
5546 else if (GET_CODE (x) == AND
5547 && GET_MODE (x) == DImode
5548 && GET_CODE (XEXP (x, 0)) == MULT
5549 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5550 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5551 && CONST_INT_P (XEXP (x, 1)))
5552 {
5553 type = ADDRESS_REG_UXTW;
5554 index = XEXP (XEXP (x, 0), 0);
5555 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5556 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5557 shift = -1;
5558 }
5559 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5560 else if ((GET_CODE (x) == SIGN_EXTRACT
5561 || GET_CODE (x) == ZERO_EXTRACT)
5562 && GET_MODE (x) == DImode
5563 && GET_CODE (XEXP (x, 0)) == ASHIFT
5564 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5565 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5566 {
5567 type = (GET_CODE (x) == SIGN_EXTRACT)
5568 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5569 index = XEXP (XEXP (x, 0), 0);
5570 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5571 if (INTVAL (XEXP (x, 1)) != 32 + shift
5572 || INTVAL (XEXP (x, 2)) != 0)
5573 shift = -1;
5574 }
5575 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5576 (const_int 0xffffffff<<shift)) */
5577 else if (GET_CODE (x) == AND
5578 && GET_MODE (x) == DImode
5579 && GET_CODE (XEXP (x, 0)) == ASHIFT
5580 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5581 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5582 && CONST_INT_P (XEXP (x, 1)))
5583 {
5584 type = ADDRESS_REG_UXTW;
5585 index = XEXP (XEXP (x, 0), 0);
5586 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5587 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5588 shift = -1;
5589 }
5590 /* (mult:P (reg:P) (const_int scale)) */
5591 else if (GET_CODE (x) == MULT
5592 && GET_MODE (x) == Pmode
5593 && GET_MODE (XEXP (x, 0)) == Pmode
5594 && CONST_INT_P (XEXP (x, 1)))
5595 {
5596 type = ADDRESS_REG_REG;
5597 index = XEXP (x, 0);
5598 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5599 }
5600 /* (ashift:P (reg:P) (const_int shift)) */
5601 else if (GET_CODE (x) == ASHIFT
5602 && GET_MODE (x) == Pmode
5603 && GET_MODE (XEXP (x, 0)) == Pmode
5604 && CONST_INT_P (XEXP (x, 1)))
5605 {
5606 type = ADDRESS_REG_REG;
5607 index = XEXP (x, 0);
5608 shift = INTVAL (XEXP (x, 1));
5609 }
5610 else
5611 return false;
5612
5613 if (!strict_p
5614 && GET_CODE (index) == SUBREG
5615 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5616 index = SUBREG_REG (index);
5617
5618 if (aarch64_sve_data_mode_p (mode))
5619 {
5620 if (type != ADDRESS_REG_REG
5621 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5622 return false;
5623 }
5624 else
5625 {
5626 if (shift != 0
5627 && !(IN_RANGE (shift, 1, 3)
5628 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5629 return false;
5630 }
5631
5632 if (REG_P (index)
5633 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5634 {
5635 info->type = type;
5636 info->offset = index;
5637 info->shift = shift;
5638 return true;
5639 }
5640
5641 return false;
5642 }
5643
5644 /* Return true if MODE is one of the modes for which we
5645 support LDP/STP operations. */
5646
5647 static bool
5648 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5649 {
5650 return mode == SImode || mode == DImode
5651 || mode == SFmode || mode == DFmode
5652 || (aarch64_vector_mode_supported_p (mode)
5653 && known_eq (GET_MODE_SIZE (mode), 8));
5654 }
5655
5656 /* Return true if REGNO is a virtual pointer register, or an eliminable
5657 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5658 include stack_pointer or hard_frame_pointer. */
5659 static bool
5660 virt_or_elim_regno_p (unsigned regno)
5661 {
5662 return ((regno >= FIRST_VIRTUAL_REGISTER
5663 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5664 || regno == FRAME_POINTER_REGNUM
5665 || regno == ARG_POINTER_REGNUM);
5666 }
5667
5668 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5669 If it is, fill in INFO appropriately. STRICT_P is true if
5670 REG_OK_STRICT is in effect. */
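/* Broadly (illustrative summary), the accepted forms correspond to:
   [Rn], [Rn, #imm], [Rn, Rm{, LSL #s}], [Rn, Wm, SXTW|UXTW {#s}],
   pre/post-indexed writeback, PC-relative literals, and LO_SUM
   (ADRP+ADD style) addresses; the permitted offset ranges depend on MODE.  */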
5671
5672 static bool
5673 aarch64_classify_address (struct aarch64_address_info *info,
5674 rtx x, machine_mode mode, bool strict_p,
5675 aarch64_addr_query_type type = ADDR_QUERY_M)
5676 {
5677 enum rtx_code code = GET_CODE (x);
5678 rtx op0, op1;
5679 poly_int64 offset;
5680
5681 HOST_WIDE_INT const_size;
5682
5683 /* On BE, we use load/store pair for all large int mode load/stores.
5684 TI/TFmode may also use a load/store pair. */
5685 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5686 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5687 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5688 || mode == TImode
5689 || mode == TFmode
5690 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5691
5692 bool allow_reg_index_p = (!load_store_pair_p
5693 && (known_lt (GET_MODE_SIZE (mode), 16)
5694 || vec_flags == VEC_ADVSIMD
5695 || vec_flags == VEC_SVE_DATA));
5696
5697 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5698 [Rn, #offset, MUL VL]. */
5699 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5700 && (code != REG && code != PLUS))
5701 return false;
5702
5703 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5704 REG addressing. */
5705 if (advsimd_struct_p
5706 && !BYTES_BIG_ENDIAN
5707 && (code != POST_INC && code != REG))
5708 return false;
5709
5710 gcc_checking_assert (GET_MODE (x) == VOIDmode
5711 || SCALAR_INT_MODE_P (GET_MODE (x)));
5712
5713 switch (code)
5714 {
5715 case REG:
5716 case SUBREG:
5717 info->type = ADDRESS_REG_IMM;
5718 info->base = x;
5719 info->offset = const0_rtx;
5720 info->const_offset = 0;
5721 return aarch64_base_register_rtx_p (x, strict_p);
5722
5723 case PLUS:
5724 op0 = XEXP (x, 0);
5725 op1 = XEXP (x, 1);
5726
5727 if (! strict_p
5728 && REG_P (op0)
5729 && virt_or_elim_regno_p (REGNO (op0))
5730 && poly_int_rtx_p (op1, &offset))
5731 {
5732 info->type = ADDRESS_REG_IMM;
5733 info->base = op0;
5734 info->offset = op1;
5735 info->const_offset = offset;
5736
5737 return true;
5738 }
5739
5740 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5741 && aarch64_base_register_rtx_p (op0, strict_p)
5742 && poly_int_rtx_p (op1, &offset))
5743 {
5744 info->type = ADDRESS_REG_IMM;
5745 info->base = op0;
5746 info->offset = op1;
5747 info->const_offset = offset;
5748
5749 /* TImode and TFmode values are allowed in both pairs of X
5750 registers and individual Q registers. The available
5751 address modes are:
5752 X,X: 7-bit signed scaled offset
5753 Q: 9-bit signed offset
5754 We conservatively require an offset representable in either mode.
5755 When performing the check for pairs of X registers, i.e. LDP/STP,
5756 pass down DImode since that is the natural size of the LDP/STP
5757 instruction memory accesses. */
5758 if (mode == TImode || mode == TFmode)
5759 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5760 && (offset_9bit_signed_unscaled_p (mode, offset)
5761 || offset_12bit_unsigned_scaled_p (mode, offset)));
5762
5763 /* A 7-bit offset check because OImode will emit an ldp/stp
5764 instruction (only big endian will get here).
5765 For ldp/stp instructions, the offset is scaled for the size of a
5766 single element of the pair. */
5767 if (mode == OImode)
5768 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5769
5770 /* Three 9/12-bit offset checks because CImode will emit three
5771 ldr/str instructions (only big endian will get here). */
5772 if (mode == CImode)
5773 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5774 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5775 || offset_12bit_unsigned_scaled_p (V16QImode,
5776 offset + 32)));
5777
5778 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5779 instructions (only big endian will get here). */
5780 if (mode == XImode)
5781 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5782 && aarch64_offset_7bit_signed_scaled_p (TImode,
5783 offset + 32));
5784
5785 /* Make "m" use the LD1 offset range for SVE data modes, so
5786 that pre-RTL optimizers like ivopts will work to that
5787 instead of the wider LDR/STR range. */
5788 if (vec_flags == VEC_SVE_DATA)
5789 return (type == ADDR_QUERY_M
5790 ? offset_4bit_signed_scaled_p (mode, offset)
5791 : offset_9bit_signed_scaled_p (mode, offset));
5792
5793 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5794 {
5795 poly_int64 end_offset = (offset
5796 + GET_MODE_SIZE (mode)
5797 - BYTES_PER_SVE_VECTOR);
5798 return (type == ADDR_QUERY_M
5799 ? offset_4bit_signed_scaled_p (mode, offset)
5800 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5801 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5802 end_offset)));
5803 }
5804
5805 if (vec_flags == VEC_SVE_PRED)
5806 return offset_9bit_signed_scaled_p (mode, offset);
5807
5808 if (load_store_pair_p)
5809 return ((known_eq (GET_MODE_SIZE (mode), 4)
5810 || known_eq (GET_MODE_SIZE (mode), 8))
5811 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5812 else
5813 return (offset_9bit_signed_unscaled_p (mode, offset)
5814 || offset_12bit_unsigned_scaled_p (mode, offset));
5815 }
5816
5817 if (allow_reg_index_p)
5818 {
5819 /* Look for base + (scaled/extended) index register. */
5820 if (aarch64_base_register_rtx_p (op0, strict_p)
5821 && aarch64_classify_index (info, op1, mode, strict_p))
5822 {
5823 info->base = op0;
5824 return true;
5825 }
5826 if (aarch64_base_register_rtx_p (op1, strict_p)
5827 && aarch64_classify_index (info, op0, mode, strict_p))
5828 {
5829 info->base = op1;
5830 return true;
5831 }
5832 }
5833
5834 return false;
5835
5836 case POST_INC:
5837 case POST_DEC:
5838 case PRE_INC:
5839 case PRE_DEC:
5840 info->type = ADDRESS_REG_WB;
5841 info->base = XEXP (x, 0);
5842 info->offset = NULL_RTX;
5843 return aarch64_base_register_rtx_p (info->base, strict_p);
5844
5845 case POST_MODIFY:
5846 case PRE_MODIFY:
5847 info->type = ADDRESS_REG_WB;
5848 info->base = XEXP (x, 0);
5849 if (GET_CODE (XEXP (x, 1)) == PLUS
5850 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5851 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5852 && aarch64_base_register_rtx_p (info->base, strict_p))
5853 {
5854 info->offset = XEXP (XEXP (x, 1), 1);
5855 info->const_offset = offset;
5856
5857 /* TImode and TFmode values are allowed in both pairs of X
5858 registers and individual Q registers. The available
5859 address modes are:
5860 X,X: 7-bit signed scaled offset
5861 Q: 9-bit signed offset
5862 We conservatively require an offset representable in either mode.
5863 */
5864 if (mode == TImode || mode == TFmode)
5865 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5866 && offset_9bit_signed_unscaled_p (mode, offset));
5867
5868 if (load_store_pair_p)
5869 return ((known_eq (GET_MODE_SIZE (mode), 4)
5870 || known_eq (GET_MODE_SIZE (mode), 8))
5871 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5872 else
5873 return offset_9bit_signed_unscaled_p (mode, offset);
5874 }
5875 return false;
5876
5877 case CONST:
5878 case SYMBOL_REF:
5879 case LABEL_REF:
5880 /* Load literal: pc-relative constant pool entry. Only supported
5881 for SI mode or larger. */
5882 info->type = ADDRESS_SYMBOLIC;
5883
5884 if (!load_store_pair_p
5885 && GET_MODE_SIZE (mode).is_constant (&const_size)
5886 && const_size >= 4)
5887 {
5888 rtx sym, addend;
5889
5890 split_const (x, &sym, &addend);
5891 return ((GET_CODE (sym) == LABEL_REF
5892 || (GET_CODE (sym) == SYMBOL_REF
5893 && CONSTANT_POOL_ADDRESS_P (sym)
5894 && aarch64_pcrelative_literal_loads)));
5895 }
5896 return false;
5897
5898 case LO_SUM:
5899 info->type = ADDRESS_LO_SUM;
5900 info->base = XEXP (x, 0);
5901 info->offset = XEXP (x, 1);
5902 if (allow_reg_index_p
5903 && aarch64_base_register_rtx_p (info->base, strict_p))
5904 {
5905 rtx sym, offs;
5906 split_const (info->offset, &sym, &offs);
5907 if (GET_CODE (sym) == SYMBOL_REF
5908 && (aarch64_classify_symbol (sym, INTVAL (offs))
5909 == SYMBOL_SMALL_ABSOLUTE))
5910 {
5911 /* The symbol and offset must be aligned to the access size. */
5912 unsigned int align;
5913
5914 if (CONSTANT_POOL_ADDRESS_P (sym))
5915 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5916 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5917 {
5918 tree exp = SYMBOL_REF_DECL (sym);
5919 align = TYPE_ALIGN (TREE_TYPE (exp));
5920 align = aarch64_constant_alignment (exp, align);
5921 }
5922 else if (SYMBOL_REF_DECL (sym))
5923 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5924 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5925 && SYMBOL_REF_BLOCK (sym) != NULL)
5926 align = SYMBOL_REF_BLOCK (sym)->alignment;
5927 else
5928 align = BITS_PER_UNIT;
5929
5930 poly_int64 ref_size = GET_MODE_SIZE (mode);
5931 if (known_eq (ref_size, 0))
5932 ref_size = GET_MODE_SIZE (DImode);
5933
5934 return (multiple_p (INTVAL (offs), ref_size)
5935 && multiple_p (align / BITS_PER_UNIT, ref_size));
5936 }
5937 }
5938 return false;
5939
5940 default:
5941 return false;
5942 }
5943 }
5944
5945 /* Return true if the address X is valid for a PRFM instruction.
5946 STRICT_P is true if we should do strict checking with
5947 aarch64_classify_address. */
5948
5949 bool
5950 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5951 {
5952 struct aarch64_address_info addr;
5953
5954 /* PRFM accepts the same addresses as DImode... */
5955 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5956 if (!res)
5957 return false;
5958
5959 /* ... except writeback forms. */
5960 return addr.type != ADDRESS_REG_WB;
5961 }
5962
5963 bool
5964 aarch64_symbolic_address_p (rtx x)
5965 {
5966 rtx offset;
5967
5968 split_const (x, &x, &offset);
5969 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5970 }
5971
5972 /* Classify the base of symbolic expression X. */
5973
5974 enum aarch64_symbol_type
5975 aarch64_classify_symbolic_expression (rtx x)
5976 {
5977 rtx offset;
5978
5979 split_const (x, &x, &offset);
5980 return aarch64_classify_symbol (x, INTVAL (offset));
5981 }
5982
5983
5984 /* Return TRUE if X is a legitimate address for accessing memory in
5985 mode MODE. */
5986 static bool
5987 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5988 {
5989 struct aarch64_address_info addr;
5990
5991 return aarch64_classify_address (&addr, x, mode, strict_p);
5992 }
5993
5994 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5995 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5996 bool
5997 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5998 aarch64_addr_query_type type)
5999 {
6000 struct aarch64_address_info addr;
6001
6002 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6003 }
6004
6005 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6006
6007 static bool
6008 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6009 poly_int64 orig_offset,
6010 machine_mode mode)
6011 {
6012 HOST_WIDE_INT size;
6013 if (GET_MODE_SIZE (mode).is_constant (&size))
6014 {
6015 HOST_WIDE_INT const_offset, second_offset;
6016
6017 /* A general SVE offset is A * VQ + B. Remove the A component from
6018 coefficient 0 in order to get the constant B. */
6019 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6020
6021 /* Split an out-of-range address displacement into a base and
6022 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6023 range otherwise to increase opportunities for sharing the base
6024 address between accesses of different sizes. Unaligned accesses use the signed
6025 9-bit range, TImode/TFmode use the intersection of signed
6026 scaled 7-bit and signed 9-bit offset. */
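/* For example (illustrative), an aligned DImode offset of 0x10008 is split
   here into an anchor of 0x10000 plus an in-range offset of 0x8, since
   0x10008 & 0x3ffc == 0x8.  */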
6027 if (mode == TImode || mode == TFmode)
6028 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6029 else if ((const_offset & (size - 1)) != 0)
6030 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6031 else
6032 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6033
6034 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6035 return false;
6036
6037 /* Split the offset into second_offset and the rest. */
6038 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6039 *offset2 = gen_int_mode (second_offset, Pmode);
6040 return true;
6041 }
6042 else
6043 {
6044 /* Get the mode we should use as the basis of the range. For structure
6045 modes this is the mode of one vector. */
6046 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6047 machine_mode step_mode
6048 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6049
6050 /* Get the "mul vl" multiplier we'd like to use. */
6051 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6052 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6053 if (vec_flags & VEC_SVE_DATA)
6054 /* LDR supports a 9-bit range, but the move patterns for
6055 structure modes require all vectors to be in range of the
6056 same base. The simplest way of accommodating that while still
6057 promoting reuse of anchor points between different modes is
6058 to use an 8-bit range unconditionally. */
6059 vnum = ((vnum + 128) & 255) - 128;
6060 else
6061 /* Predicates are only handled singly, so we might as well use
6062 the full range. */
6063 vnum = ((vnum + 256) & 511) - 256;
6064 if (vnum == 0)
6065 return false;
6066
6067 /* Convert the "mul vl" multiplier into a byte offset. */
6068 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6069 if (known_eq (second_offset, orig_offset))
6070 return false;
6071
6072 /* Split the offset into second_offset and the rest. */
6073 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6074 *offset2 = gen_int_mode (second_offset, Pmode);
6075 return true;
6076 }
6077 }
6078
6079 /* Return the binary representation of floating point constant VALUE in INTVAL.
6080 If the value cannot be converted, return false without setting INTVAL.
6081 The conversion is done in the mode of VALUE. */
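/* E.g. the DFmode constant 1.0 yields 0x3ff0000000000000 and the SFmode
   constant 1.0 yields 0x3f800000.  */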
6082 bool
6083 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6084 {
6085
6086 /* We make a general exception for 0. */
6087 if (aarch64_float_const_zero_rtx_p (value))
6088 {
6089 *intval = 0;
6090 return true;
6091 }
6092
6093 scalar_float_mode mode;
6094 if (GET_CODE (value) != CONST_DOUBLE
6095 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6096 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6097 /* Only support up to DF mode. */
6098 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6099 return false;
6100
6101 unsigned HOST_WIDE_INT ival = 0;
6102
6103 long res[2];
6104 real_to_target (res,
6105 CONST_DOUBLE_REAL_VALUE (value),
6106 REAL_MODE_FORMAT (mode));
6107
6108 if (mode == DFmode)
6109 {
6110 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6111 ival = zext_hwi (res[order], 32);
6112 ival |= (zext_hwi (res[1 - order], 32) << 32);
6113 }
6114 else
6115 ival = zext_hwi (res[0], 32);
6116
6117 *intval = ival;
6118 return true;
6119 }
6120
6121 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6122 single MOV(+MOVK) followed by an FMOV. */
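/* For example, double 1.0 (0x3ff0000000000000) needs only a single MOVZ
   before the FMOV, so it is considered cheaper than a literal-pool load.  */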
6123 bool
6124 aarch64_float_const_rtx_p (rtx x)
6125 {
6126 machine_mode mode = GET_MODE (x);
6127 if (mode == VOIDmode)
6128 return false;
6129
6130 /* Determine whether it's cheaper to write float constants as
6131 mov/movk pairs rather than ldr/adrp pairs. */
6132 unsigned HOST_WIDE_INT ival;
6133
6134 if (GET_CODE (x) == CONST_DOUBLE
6135 && SCALAR_FLOAT_MODE_P (mode)
6136 && aarch64_reinterpret_float_as_int (x, &ival))
6137 {
6138 scalar_int_mode imode = (mode == HFmode
6139 ? SImode
6140 : int_mode_for_mode (mode).require ());
6141 int num_instr = aarch64_internal_mov_immediate
6142 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6143 return num_instr < 3;
6144 }
6145
6146 return false;
6147 }
6148
6149 /* Return TRUE if rtx X is the immediate constant 0.0. */
6150 bool
6151 aarch64_float_const_zero_rtx_p (rtx x)
6152 {
6153 if (GET_MODE (x) == VOIDmode)
6154 return false;
6155
6156 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6157 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6158 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6159 }
6160
6161 /* Return TRUE if rtx X is an immediate constant that fits in a single
6162 MOVI immediate operation. */
6163 bool
6164 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6165 {
6166 if (!TARGET_SIMD)
6167 return false;
6168
6169 machine_mode vmode;
6170 scalar_int_mode imode;
6171 unsigned HOST_WIDE_INT ival;
6172
6173 if (GET_CODE (x) == CONST_DOUBLE
6174 && SCALAR_FLOAT_MODE_P (mode))
6175 {
6176 if (!aarch64_reinterpret_float_as_int (x, &ival))
6177 return false;
6178
6179 /* We make a general exception for 0. */
6180 if (aarch64_float_const_zero_rtx_p (x))
6181 return true;
6182
6183 imode = int_mode_for_mode (mode).require ();
6184 }
6185 else if (GET_CODE (x) == CONST_INT
6186 && is_a <scalar_int_mode> (mode, &imode))
6187 ival = INTVAL (x);
6188 else
6189 return false;
6190
6191 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
6192 a 128-bit vector mode. */
6193 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6194
6195 vmode = aarch64_simd_container_mode (imode, width);
6196 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6197
6198 return aarch64_simd_valid_immediate (v_op, NULL);
6199 }
6200
6201
6202 /* Return the fixed registers used for condition codes. */
6203
6204 static bool
6205 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6206 {
6207 *p1 = CC_REGNUM;
6208 *p2 = INVALID_REGNUM;
6209 return true;
6210 }
6211
6212 /* This function is used by the call expanders of the machine description.
6213 RESULT is the register in which the result is returned. It's NULL for
6214 "call" and "sibcall".
6215 MEM is the location of the function call.
6216 SIBCALL indicates whether this function call is a normal call or a sibling
6217 call, and a different pattern is generated accordingly. */
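/* Roughly speaking (illustrative), a normal call becomes
   (parallel [(call (mem fn) 0) (clobber (reg LR))]), a sibcall uses
   (return) in place of the clobber, and a value-returning call wraps
   the call rtx in a SET of RESULT.  */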
6218
6219 void
6220 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6221 {
6222 rtx call, callee, tmp;
6223 rtvec vec;
6224 machine_mode mode;
6225
6226 gcc_assert (MEM_P (mem));
6227 callee = XEXP (mem, 0);
6228 mode = GET_MODE (callee);
6229 gcc_assert (mode == Pmode);
6230
6231 /* Decide if we should generate indirect calls by loading the
6232 address of the callee into a register before performing
6233 the branch-and-link. */
6234 if (SYMBOL_REF_P (callee)
6235 ? (aarch64_is_long_call_p (callee)
6236 || aarch64_is_noplt_call_p (callee))
6237 : !REG_P (callee))
6238 XEXP (mem, 0) = force_reg (mode, callee);
6239
6240 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6241
6242 if (result != NULL_RTX)
6243 call = gen_rtx_SET (result, call);
6244
6245 if (sibcall)
6246 tmp = ret_rtx;
6247 else
6248 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6249
6250 vec = gen_rtvec (2, call, tmp);
6251 call = gen_rtx_PARALLEL (VOIDmode, vec);
6252
6253 aarch64_emit_call_insn (call);
6254 }
6255
6256 /* Emit call insn with PAT and do aarch64-specific handling. */
6257
6258 void
6259 aarch64_emit_call_insn (rtx pat)
6260 {
6261 rtx insn = emit_call_insn (pat);
6262
6263 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6264 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6265 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6266 }
6267
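/* Return the CC mode to use when comparing rtx operands X and Y with
   rtx code CODE; used via the SELECT_CC_MODE macro.  */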
6268 machine_mode
6269 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6270 {
6271 /* All floating point compares return CCFP if it is an equality
6272 comparison, and CCFPE otherwise. */
6273 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6274 {
6275 switch (code)
6276 {
6277 case EQ:
6278 case NE:
6279 case UNORDERED:
6280 case ORDERED:
6281 case UNLT:
6282 case UNLE:
6283 case UNGT:
6284 case UNGE:
6285 case UNEQ:
6286 return CCFPmode;
6287
6288 case LT:
6289 case LE:
6290 case GT:
6291 case GE:
6292 case LTGT:
6293 return CCFPEmode;
6294
6295 default:
6296 gcc_unreachable ();
6297 }
6298 }
6299
6300 /* Equality comparisons of short modes against zero can be performed
6301 using the TST instruction with the appropriate bitmask. */
6302 if (y == const0_rtx && REG_P (x)
6303 && (code == EQ || code == NE)
6304 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6305 return CC_NZmode;
6306
6307 /* Similarly, comparisons of zero_extends from shorter modes can
6308 be performed using an ANDS with an immediate mask. */
6309 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6310 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6311 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6312 && (code == EQ || code == NE))
6313 return CC_NZmode;
6314
6315 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6316 && y == const0_rtx
6317 && (code == EQ || code == NE || code == LT || code == GE)
6318 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6319 || GET_CODE (x) == NEG
6320 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6321 && CONST_INT_P (XEXP (x, 2)))))
6322 return CC_NZmode;
6323
6324 /* A compare with a shifted operand. Because of canonicalization,
6325 the comparison will have to be swapped when we emit the assembly
6326 code. */
6327 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6328 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6329 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6330 || GET_CODE (x) == LSHIFTRT
6331 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6332 return CC_SWPmode;
6333
6334 /* Similarly for a negated operand, but we can only do this for
6335 equalities. */
6336 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6337 && (REG_P (y) || GET_CODE (y) == SUBREG)
6338 && (code == EQ || code == NE)
6339 && GET_CODE (x) == NEG)
6340 return CC_Zmode;
6341
6342 /* A test for unsigned overflow. */
6343 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6344 && code == NE
6345 && GET_CODE (x) == PLUS
6346 && GET_CODE (y) == ZERO_EXTEND)
6347 return CC_Cmode;
6348
6349 /* For everything else, return CCmode. */
6350 return CCmode;
6351 }
6352
6353 static int
6354 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6355
6356 int
6357 aarch64_get_condition_code (rtx x)
6358 {
6359 machine_mode mode = GET_MODE (XEXP (x, 0));
6360 enum rtx_code comp_code = GET_CODE (x);
6361
6362 if (GET_MODE_CLASS (mode) != MODE_CC)
6363 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6364 return aarch64_get_condition_code_1 (mode, comp_code);
6365 }
6366
6367 static int
6368 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6369 {
6370 switch (mode)
6371 {
6372 case E_CCFPmode:
6373 case E_CCFPEmode:
6374 switch (comp_code)
6375 {
6376 case GE: return AARCH64_GE;
6377 case GT: return AARCH64_GT;
6378 case LE: return AARCH64_LS;
6379 case LT: return AARCH64_MI;
6380 case NE: return AARCH64_NE;
6381 case EQ: return AARCH64_EQ;
6382 case ORDERED: return AARCH64_VC;
6383 case UNORDERED: return AARCH64_VS;
6384 case UNLT: return AARCH64_LT;
6385 case UNLE: return AARCH64_LE;
6386 case UNGT: return AARCH64_HI;
6387 case UNGE: return AARCH64_PL;
6388 default: return -1;
6389 }
6390 break;
6391
6392 case E_CCmode:
6393 switch (comp_code)
6394 {
6395 case NE: return AARCH64_NE;
6396 case EQ: return AARCH64_EQ;
6397 case GE: return AARCH64_GE;
6398 case GT: return AARCH64_GT;
6399 case LE: return AARCH64_LE;
6400 case LT: return AARCH64_LT;
6401 case GEU: return AARCH64_CS;
6402 case GTU: return AARCH64_HI;
6403 case LEU: return AARCH64_LS;
6404 case LTU: return AARCH64_CC;
6405 default: return -1;
6406 }
6407 break;
6408
6409 case E_CC_SWPmode:
6410 switch (comp_code)
6411 {
6412 case NE: return AARCH64_NE;
6413 case EQ: return AARCH64_EQ;
6414 case GE: return AARCH64_LE;
6415 case GT: return AARCH64_LT;
6416 case LE: return AARCH64_GE;
6417 case LT: return AARCH64_GT;
6418 case GEU: return AARCH64_LS;
6419 case GTU: return AARCH64_CC;
6420 case LEU: return AARCH64_CS;
6421 case LTU: return AARCH64_HI;
6422 default: return -1;
6423 }
6424 break;
6425
6426 case E_CC_NZmode:
6427 switch (comp_code)
6428 {
6429 case NE: return AARCH64_NE;
6430 case EQ: return AARCH64_EQ;
6431 case GE: return AARCH64_PL;
6432 case LT: return AARCH64_MI;
6433 default: return -1;
6434 }
6435 break;
6436
6437 case E_CC_Zmode:
6438 switch (comp_code)
6439 {
6440 case NE: return AARCH64_NE;
6441 case EQ: return AARCH64_EQ;
6442 default: return -1;
6443 }
6444 break;
6445
6446 case E_CC_Cmode:
6447 switch (comp_code)
6448 {
6449 case NE: return AARCH64_CS;
6450 case EQ: return AARCH64_CC;
6451 default: return -1;
6452 }
6453 break;
6454
6455 default:
6456 return -1;
6457 }
6458
6459 return -1;
6460 }
6461
6462 bool
6463 aarch64_const_vec_all_same_in_range_p (rtx x,
6464 HOST_WIDE_INT minval,
6465 HOST_WIDE_INT maxval)
6466 {
6467 rtx elt;
6468 return (const_vec_duplicate_p (x, &elt)
6469 && CONST_INT_P (elt)
6470 && IN_RANGE (INTVAL (elt), minval, maxval));
6471 }
6472
6473 bool
6474 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6475 {
6476 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6477 }
6478
6479 /* Return true if VEC is a constant in which every element is in the range
6480 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6481
6482 static bool
6483 aarch64_const_vec_all_in_range_p (rtx vec,
6484 HOST_WIDE_INT minval,
6485 HOST_WIDE_INT maxval)
6486 {
6487 if (GET_CODE (vec) != CONST_VECTOR
6488 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6489 return false;
6490
6491 int nunits;
6492 if (!CONST_VECTOR_STEPPED_P (vec))
6493 nunits = const_vector_encoded_nelts (vec);
6494 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6495 return false;
6496
6497 for (int i = 0; i < nunits; i++)
6498 {
6499 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6500 if (!CONST_INT_P (vec_elem)
6501 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6502 return false;
6503 }
6504 return true;
6505 }
6506
6507 /* N Z C V. */
6508 #define AARCH64_CC_V 1
6509 #define AARCH64_CC_C (1 << 1)
6510 #define AARCH64_CC_Z (1 << 2)
6511 #define AARCH64_CC_N (1 << 3)
6512
6513 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6514 static const int aarch64_nzcv_codes[] =
6515 {
6516 0, /* EQ, Z == 1. */
6517 AARCH64_CC_Z, /* NE, Z == 0. */
6518 0, /* CS, C == 1. */
6519 AARCH64_CC_C, /* CC, C == 0. */
6520 0, /* MI, N == 1. */
6521 AARCH64_CC_N, /* PL, N == 0. */
6522 0, /* VS, V == 1. */
6523 AARCH64_CC_V, /* VC, V == 0. */
6524 0, /* HI, C == 1 && Z == 0. */
6525 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6526 AARCH64_CC_V, /* GE, N == V. */
6527 0, /* LT, N != V. */
6528 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6529 0, /* LE, !(Z == 0 && N == V). */
6530 0, /* AL, Any. */
6531 0 /* NV, Any. */
6532 };
6533
6534 /* Print floating-point vector immediate operand X to F, negating it
6535 first if NEGATE is true. Return true on success, false if it isn't
6536 a constant we can handle. */
6537
6538 static bool
6539 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6540 {
6541 rtx elt;
6542
6543 if (!const_vec_duplicate_p (x, &elt))
6544 return false;
6545
6546 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6547 if (negate)
6548 r = real_value_negate (&r);
6549
6550 /* We only handle the SVE single-bit immediates here. */
6551 if (real_equal (&r, &dconst0))
6552 asm_fprintf (f, "0.0");
6553 else if (real_equal (&r, &dconst1))
6554 asm_fprintf (f, "1.0");
6555 else if (real_equal (&r, &dconsthalf))
6556 asm_fprintf (f, "0.5");
6557 else
6558 return false;
6559
6560 return true;
6561 }
6562
6563 /* Return the equivalent letter for size. */
6564 static char
6565 sizetochar (int size)
6566 {
6567 switch (size)
6568 {
6569 case 64: return 'd';
6570 case 32: return 's';
6571 case 16: return 'h';
6572 case 8 : return 'b';
6573 default: gcc_unreachable ();
6574 }
6575 }
6576
6577 /* Print operand X to file F in a target specific manner according to CODE.
6578 The acceptable formatting commands given by CODE are:
6579 'c': An integer or symbol address without a preceding #
6580 sign.
6581 'C': Take the duplicated element in a vector constant
6582 and print it in hex.
6583 'D': Take the duplicated element in a vector constant
6584 and print it as an unsigned integer, in decimal.
6585 'e': Print the sign/zero-extend size as a character 8->b,
6586 16->h, 32->w.
6587 'p': Prints N such that 2^N == X (X must be power of 2 and
6588 const int).
6589 'P': Print the number of non-zero bits in X (a const_int).
6590 'H': Print the higher numbered register of a pair (TImode)
6591 of regs.
6592 'm': Print a condition (eq, ne, etc).
6593 'M': Same as 'm', but invert condition.
6594 'N': Take the duplicated element in a vector constant
6595 and print the negative of it in decimal.
6596 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6597 'S/T/U/V': Print a FP/SIMD register name for a register list.
6598 The register printed is the FP/SIMD register name
6599 of X + 0/1/2/3 for S/T/U/V.
6600 'R': Print a scalar FP/SIMD register name + 1.
6601 'X': Print bottom 16 bits of integer constant in hex.
6602 'w/x': Print a general register name or the zero register
6603 (32-bit or 64-bit).
6604 '0': Print a normal operand, if it's a general register,
6605 then we assume DImode.
6606 'k': Print NZCV for conditional compare instructions.
6607 'A': Output address constant representing the first
6608 argument of X, specifying a relocation offset
6609 if appropriate.
6610 'L': Output constant address specified by X
6611 with a relocation offset if appropriate.
6612 'G': Prints address of X, specifying a PC relative
6613 relocation mode if appropriate.
6614 'y': Output address of LDP or STP - this is used for
6615 some LDP/STPs which don't use a PARALLEL in their
6616 pattern (so the mode needs to be adjusted).
6617 'z': Output address of a typical LDP or STP. */
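/* For instance (illustrative), with register x3 as operand 0, the template
   fragment "%x0" prints "x3" and "%w0" prints "w3"; with the constant 16 as
   operand 0, "%p0" prints "4".  */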
6618
6619 static void
6620 aarch64_print_operand (FILE *f, rtx x, int code)
6621 {
6622 rtx elt;
6623 switch (code)
6624 {
6625 case 'c':
6626 switch (GET_CODE (x))
6627 {
6628 case CONST_INT:
6629 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6630 break;
6631
6632 case SYMBOL_REF:
6633 output_addr_const (f, x);
6634 break;
6635
6636 case CONST:
6637 if (GET_CODE (XEXP (x, 0)) == PLUS
6638 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6639 {
6640 output_addr_const (f, x);
6641 break;
6642 }
6643 /* Fall through. */
6644
6645 default:
6646 output_operand_lossage ("unsupported operand for code '%c'", code);
6647 }
6648 break;
6649
6650 case 'e':
6651 {
6652 int n;
6653
6654 if (!CONST_INT_P (x)
6655 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6656 {
6657 output_operand_lossage ("invalid operand for '%%%c'", code);
6658 return;
6659 }
6660
6661 switch (n)
6662 {
6663 case 3:
6664 fputc ('b', f);
6665 break;
6666 case 4:
6667 fputc ('h', f);
6668 break;
6669 case 5:
6670 fputc ('w', f);
6671 break;
6672 default:
6673 output_operand_lossage ("invalid operand for '%%%c'", code);
6674 return;
6675 }
6676 }
6677 break;
6678
6679 case 'p':
6680 {
6681 int n;
6682
6683 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6684 {
6685 output_operand_lossage ("invalid operand for '%%%c'", code);
6686 return;
6687 }
6688
6689 asm_fprintf (f, "%d", n);
6690 }
6691 break;
6692
6693 case 'P':
6694 if (!CONST_INT_P (x))
6695 {
6696 output_operand_lossage ("invalid operand for '%%%c'", code);
6697 return;
6698 }
6699
6700 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6701 break;
6702
6703 case 'H':
6704 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6705 {
6706 output_operand_lossage ("invalid operand for '%%%c'", code);
6707 return;
6708 }
6709
6710 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6711 break;
6712
6713 case 'M':
6714 case 'm':
6715 {
6716 int cond_code;
6717 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6718 if (x == const_true_rtx)
6719 {
6720 if (code == 'M')
6721 fputs ("nv", f);
6722 return;
6723 }
6724
6725 if (!COMPARISON_P (x))
6726 {
6727 output_operand_lossage ("invalid operand for '%%%c'", code);
6728 return;
6729 }
6730
6731 cond_code = aarch64_get_condition_code (x);
6732 gcc_assert (cond_code >= 0);
6733 if (code == 'M')
6734 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6735 fputs (aarch64_condition_codes[cond_code], f);
6736 }
6737 break;
6738
6739 case 'N':
6740 if (!const_vec_duplicate_p (x, &elt))
6741 {
6742 output_operand_lossage ("invalid vector constant");
6743 return;
6744 }
6745
6746 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6747 asm_fprintf (f, "%wd", -INTVAL (elt));
6748 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6749 && aarch64_print_vector_float_operand (f, x, true))
6750 ;
6751 else
6752 {
6753 output_operand_lossage ("invalid vector constant");
6754 return;
6755 }
6756 break;
6757
6758 case 'b':
6759 case 'h':
6760 case 's':
6761 case 'd':
6762 case 'q':
6763 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6764 {
6765 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6766 return;
6767 }
6768 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6769 break;
6770
6771 case 'S':
6772 case 'T':
6773 case 'U':
6774 case 'V':
6775 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6776 {
6777 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6778 return;
6779 }
6780 asm_fprintf (f, "%c%d",
6781 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6782 REGNO (x) - V0_REGNUM + (code - 'S'));
6783 break;
6784
6785 case 'R':
6786 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6787 {
6788 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6789 return;
6790 }
6791 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6792 break;
6793
6794 case 'X':
6795 if (!CONST_INT_P (x))
6796 {
6797 output_operand_lossage ("invalid operand for '%%%c'", code);
6798 return;
6799 }
6800 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6801 break;
6802
6803 case 'C':
6804 {
6805 /* Print a replicated constant in hex. */
6806 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6807 {
6808 output_operand_lossage ("invalid operand for '%%%c'", code);
6809 return;
6810 }
6811 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6812 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6813 }
6814 break;
6815
6816 case 'D':
6817 {
6818 /* Print a replicated constant in decimal, treating it as
6819 unsigned. */
6820 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6821 {
6822 output_operand_lossage ("invalid operand for '%%%c'", code);
6823 return;
6824 }
6825 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6826 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6827 }
6828 break;
6829
6830 case 'w':
6831 case 'x':
6832 if (x == const0_rtx
6833 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6834 {
6835 asm_fprintf (f, "%czr", code);
6836 break;
6837 }
6838
6839 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6840 {
6841 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6842 break;
6843 }
6844
6845 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6846 {
6847 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6848 break;
6849 }
6850
6851 /* Fall through */
6852
6853 case 0:
6854 if (x == NULL)
6855 {
6856 output_operand_lossage ("missing operand");
6857 return;
6858 }
6859
6860 switch (GET_CODE (x))
6861 {
6862 case REG:
6863 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6864 {
6865 if (REG_NREGS (x) == 1)
6866 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6867 else
6868 {
6869 char suffix
6870 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6871 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6872 REGNO (x) - V0_REGNUM, suffix,
6873 END_REGNO (x) - V0_REGNUM - 1, suffix);
6874 }
6875 }
6876 else
6877 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6878 break;
6879
6880 case MEM:
6881 output_address (GET_MODE (x), XEXP (x, 0));
6882 break;
6883
6884 case LABEL_REF:
6885 case SYMBOL_REF:
6886 output_addr_const (asm_out_file, x);
6887 break;
6888
6889 case CONST_INT:
6890 asm_fprintf (f, "%wd", INTVAL (x));
6891 break;
6892
6893 case CONST:
6894 if (!VECTOR_MODE_P (GET_MODE (x)))
6895 {
6896 output_addr_const (asm_out_file, x);
6897 break;
6898 }
6899 /* fall through */
6900
6901 case CONST_VECTOR:
6902 if (!const_vec_duplicate_p (x, &elt))
6903 {
6904 output_operand_lossage ("invalid vector constant");
6905 return;
6906 }
6907
6908 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6909 asm_fprintf (f, "%wd", INTVAL (elt));
6910 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6911 && aarch64_print_vector_float_operand (f, x, false))
6912 ;
6913 else
6914 {
6915 output_operand_lossage ("invalid vector constant");
6916 return;
6917 }
6918 break;
6919
6920 case CONST_DOUBLE:
6921 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6922 be getting CONST_DOUBLEs holding integers. */
6923 gcc_assert (GET_MODE (x) != VOIDmode);
6924 if (aarch64_float_const_zero_rtx_p (x))
6925 {
6926 fputc ('0', f);
6927 break;
6928 }
6929 else if (aarch64_float_const_representable_p (x))
6930 {
6931 #define buf_size 20
6932 char float_buf[buf_size] = {'\0'};
6933 real_to_decimal_for_mode (float_buf,
6934 CONST_DOUBLE_REAL_VALUE (x),
6935 buf_size, buf_size,
6936 1, GET_MODE (x));
6937 asm_fprintf (asm_out_file, "%s", float_buf);
6938 break;
6939 #undef buf_size
6940 }
6941 output_operand_lossage ("invalid constant");
6942 return;
6943 default:
6944 output_operand_lossage ("invalid operand");
6945 return;
6946 }
6947 break;
6948
6949 case 'A':
6950 if (GET_CODE (x) == HIGH)
6951 x = XEXP (x, 0);
6952
6953 switch (aarch64_classify_symbolic_expression (x))
6954 {
6955 case SYMBOL_SMALL_GOT_4G:
6956 asm_fprintf (asm_out_file, ":got:");
6957 break;
6958
6959 case SYMBOL_SMALL_TLSGD:
6960 asm_fprintf (asm_out_file, ":tlsgd:");
6961 break;
6962
6963 case SYMBOL_SMALL_TLSDESC:
6964 asm_fprintf (asm_out_file, ":tlsdesc:");
6965 break;
6966
6967 case SYMBOL_SMALL_TLSIE:
6968 asm_fprintf (asm_out_file, ":gottprel:");
6969 break;
6970
6971 case SYMBOL_TLSLE24:
6972 asm_fprintf (asm_out_file, ":tprel:");
6973 break;
6974
6975 case SYMBOL_TINY_GOT:
6976 gcc_unreachable ();
6977 break;
6978
6979 default:
6980 break;
6981 }
6982 output_addr_const (asm_out_file, x);
6983 break;
6984
6985 case 'L':
6986 switch (aarch64_classify_symbolic_expression (x))
6987 {
6988 case SYMBOL_SMALL_GOT_4G:
6989 asm_fprintf (asm_out_file, ":lo12:");
6990 break;
6991
6992 case SYMBOL_SMALL_TLSGD:
6993 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6994 break;
6995
6996 case SYMBOL_SMALL_TLSDESC:
6997 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6998 break;
6999
7000 case SYMBOL_SMALL_TLSIE:
7001 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7002 break;
7003
7004 case SYMBOL_TLSLE12:
7005 asm_fprintf (asm_out_file, ":tprel_lo12:");
7006 break;
7007
7008 case SYMBOL_TLSLE24:
7009 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7010 break;
7011
7012 case SYMBOL_TINY_GOT:
7013 asm_fprintf (asm_out_file, ":got:");
7014 break;
7015
7016 case SYMBOL_TINY_TLSIE:
7017 asm_fprintf (asm_out_file, ":gottprel:");
7018 break;
7019
7020 default:
7021 break;
7022 }
7023 output_addr_const (asm_out_file, x);
7024 break;
7025
7026 case 'G':
7027 switch (aarch64_classify_symbolic_expression (x))
7028 {
7029 case SYMBOL_TLSLE24:
7030 asm_fprintf (asm_out_file, ":tprel_hi12:");
7031 break;
7032 default:
7033 break;
7034 }
7035 output_addr_const (asm_out_file, x);
7036 break;
7037
7038 case 'k':
7039 {
7040 HOST_WIDE_INT cond_code;
7041
7042 if (!CONST_INT_P (x))
7043 {
7044 output_operand_lossage ("invalid operand for '%%%c'", code);
7045 return;
7046 }
7047
7048 cond_code = INTVAL (x);
7049 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7050 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7051 }
7052 break;
7053
7054 case 'y':
7055 case 'z':
7056 {
7057 machine_mode mode = GET_MODE (x);
7058
7059 if (GET_CODE (x) != MEM
7060 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7061 {
7062 output_operand_lossage ("invalid operand for '%%%c'", code);
7063 return;
7064 }
7065
7066 if (code == 'y')
7067 /* LDP/STP which uses a single double-width memory operand.
7068 Adjust the mode to appear like a typical LDP/STP.
7069 Currently this is supported for 16-byte accesses only. */
7070 mode = DFmode;
7071
7072 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7073 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7074 }
7075 break;
7076
7077 default:
7078 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7079 return;
7080 }
7081 }
7082
7083 /* Print address 'x' of a memory access with mode 'mode'.
7084 TYPE is the context required by aarch64_classify_address, e.g. ADDR_QUERY_M
7085 for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
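/* E.g. (illustrative) a base-plus-offset access is printed as "[x0, 16]",
   a register-index access as "[x0, x1, lsl 3]", and an SVE vector access two
   vectors beyond the base as "[x0, #2, mul vl]".  */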
7086 static bool
7087 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7088 aarch64_addr_query_type type)
7089 {
7090 struct aarch64_address_info addr;
7091 unsigned int size;
7092
7093 /* Check all addresses are Pmode - including ILP32. */
7094 if (GET_MODE (x) != Pmode)
7095 output_operand_lossage ("invalid address mode");
7096
7097 if (aarch64_classify_address (&addr, x, mode, true, type))
7098 switch (addr.type)
7099 {
7100 case ADDRESS_REG_IMM:
7101 if (known_eq (addr.const_offset, 0))
7102 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7103 else if (aarch64_sve_data_mode_p (mode))
7104 {
7105 HOST_WIDE_INT vnum
7106 = exact_div (addr.const_offset,
7107 BYTES_PER_SVE_VECTOR).to_constant ();
7108 asm_fprintf (f, "[%s, #%wd, mul vl]",
7109 reg_names[REGNO (addr.base)], vnum);
7110 }
7111 else if (aarch64_sve_pred_mode_p (mode))
7112 {
7113 HOST_WIDE_INT vnum
7114 = exact_div (addr.const_offset,
7115 BYTES_PER_SVE_PRED).to_constant ();
7116 asm_fprintf (f, "[%s, #%wd, mul vl]",
7117 reg_names[REGNO (addr.base)], vnum);
7118 }
7119 else
7120 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7121 INTVAL (addr.offset));
7122 return true;
7123
7124 case ADDRESS_REG_REG:
7125 if (addr.shift == 0)
7126 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7127 reg_names [REGNO (addr.offset)]);
7128 else
7129 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7130 reg_names [REGNO (addr.offset)], addr.shift);
7131 return true;
7132
7133 case ADDRESS_REG_UXTW:
7134 if (addr.shift == 0)
7135 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7136 REGNO (addr.offset) - R0_REGNUM);
7137 else
7138 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7139 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7140 return true;
7141
7142 case ADDRESS_REG_SXTW:
7143 if (addr.shift == 0)
7144 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7145 REGNO (addr.offset) - R0_REGNUM);
7146 else
7147 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7148 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7149 return true;
7150
7151 case ADDRESS_REG_WB:
7152 /* Writeback is only supported for fixed-width modes. */
7153 size = GET_MODE_SIZE (mode).to_constant ();
7154 switch (GET_CODE (x))
7155 {
7156 case PRE_INC:
7157 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7158 return true;
7159 case POST_INC:
7160 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7161 return true;
7162 case PRE_DEC:
7163 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7164 return true;
7165 case POST_DEC:
7166 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7167 return true;
7168 case PRE_MODIFY:
7169 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7170 INTVAL (addr.offset));
7171 return true;
7172 case POST_MODIFY:
7173 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7174 INTVAL (addr.offset));
7175 return true;
7176 default:
7177 break;
7178 }
7179 break;
7180
7181 case ADDRESS_LO_SUM:
7182 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7183 output_addr_const (f, addr.offset);
7184 asm_fprintf (f, "]");
7185 return true;
7186
7187 case ADDRESS_SYMBOLIC:
7188 output_addr_const (f, x);
7189 return true;
7190 }
7191
7192 return false;
7193 }
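
/* As a rough illustration of the formats used above (register numbers and
   offsets chosen arbitrarily), the address kinds print as:

        ADDRESS_REG_IMM             [x0]  or  [x0, 16]
        ADDRESS_REG_IMM (SVE)       [x0, #2, mul vl]
        ADDRESS_REG_REG             [x0, x1]  or  [x0, x1, lsl 3]
        ADDRESS_REG_UXTW / _SXTW    [x0, w1, uxtw 2]  or  [x0, w1, sxtw 2]
        ADDRESS_REG_WB              [x0, 16]!  or  [x0], 16
        ADDRESS_LO_SUM              [x0, #:lo12:symbol]  */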
7194
7195 /* Print address 'x' of an LDP/STP with mode 'mode'. */
7196 static bool
7197 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7198 {
7199 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7200 }
7201
7202 /* Print address 'x' of a memory access with mode 'mode'. */
7203 static void
7204 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7205 {
7206 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7207 output_addr_const (f, x);
7208 }
7209
7210 bool
7211 aarch64_label_mentioned_p (rtx x)
7212 {
7213 const char *fmt;
7214 int i;
7215
7216 if (GET_CODE (x) == LABEL_REF)
7217 return true;
7218
7219 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7220 referencing instruction, but they are constant offsets, not
7221 symbols. */
7222 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7223 return false;
7224
7225 fmt = GET_RTX_FORMAT (GET_CODE (x));
7226 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7227 {
7228 if (fmt[i] == 'E')
7229 {
7230 int j;
7231
7232 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7233 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7234 return true;
7235 }
7236 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7237 return true;
7238 }
7239
7240 return false;
7241 }
7242
7243 /* Implement REGNO_REG_CLASS. */
7244
7245 enum reg_class
7246 aarch64_regno_regclass (unsigned regno)
7247 {
7248 if (GP_REGNUM_P (regno))
7249 return GENERAL_REGS;
7250
7251 if (regno == SP_REGNUM)
7252 return STACK_REG;
7253
7254 if (regno == FRAME_POINTER_REGNUM
7255 || regno == ARG_POINTER_REGNUM)
7256 return POINTER_REGS;
7257
7258 if (FP_REGNUM_P (regno))
7259 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7260
7261 if (PR_REGNUM_P (regno))
7262 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7263
7264 return NO_REGS;
7265 }
7266
7267 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7268 If OFFSET is out of range, return an offset of an anchor point
7269 that is in range. Return 0 otherwise. */
7270
7271 static HOST_WIDE_INT
7272 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7273 machine_mode mode)
7274 {
7275 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7276 if (size > 16)
7277 return (offset + 0x400) & ~0x7f0;
7278
7279 /* For offsets that aren't a multiple of the access size, the limit is
7280 -256...255. */
7281 if (offset & (size - 1))
7282 {
7283 /* BLKmode typically uses LDP of X-registers. */
7284 if (mode == BLKmode)
7285 return (offset + 512) & ~0x3ff;
7286 return (offset + 0x100) & ~0x1ff;
7287 }
7288
7289 /* Small negative offsets are supported. */
7290 if (IN_RANGE (offset, -256, 0))
7291 return 0;
7292
7293 if (mode == TImode || mode == TFmode)
7294 return (offset + 0x100) & ~0x1ff;
7295
7296 /* Use a 12-bit offset scaled by the access size. */
7297 return offset & (~0xfff * size);
7298 }
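
/* For example (values chosen for illustration): a 4-byte access at offset
   0x4324 is outside the scaled 12-bit range [0, 0xfff * 4], so the code
   above returns the anchor 0x4324 & -0x4000 == 0x4000; the residual offset
   0x324 then fits the unsigned scaled immediate of a plain LDR/STR.  */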
7299
7300 static rtx
7301 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7302 {
7303 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7304 where mask is selected by alignment and size of the offset.
7305 We try to pick as large a range for the offset as possible to
7306 maximize the chance of a CSE. However, for aligned addresses
7307 we limit the range to 4k so that structures with different sized
7308 elements are likely to use the same base. We need to be careful
7309 not to split a CONST for some forms of address expression, otherwise
7310 it will generate sub-optimal code. */
7311
7312 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7313 {
7314 rtx base = XEXP (x, 0);
7315 rtx offset_rtx = XEXP (x, 1);
7316 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7317
7318 if (GET_CODE (base) == PLUS)
7319 {
7320 rtx op0 = XEXP (base, 0);
7321 rtx op1 = XEXP (base, 1);
7322
7323 /* Force any scaling into a temp for CSE. */
7324 op0 = force_reg (Pmode, op0);
7325 op1 = force_reg (Pmode, op1);
7326
7327 /* Let the pointer register be in op0. */
7328 if (REG_POINTER (op1))
7329 std::swap (op0, op1);
7330
7331 /* If the pointer is virtual or frame related, then we know that
7332 virtual register instantiation or register elimination is going
7333 to apply a second constant. We want the two constants folded
7334 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7335 if (virt_or_elim_regno_p (REGNO (op0)))
7336 {
7337 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7338 NULL_RTX, true, OPTAB_DIRECT);
7339 return gen_rtx_PLUS (Pmode, base, op1);
7340 }
7341
7342 /* Otherwise, in order to encourage CSE (and thence loop strength
7343 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7344 base = expand_binop (Pmode, add_optab, op0, op1,
7345 NULL_RTX, true, OPTAB_DIRECT);
7346 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7347 }
7348
7349 HOST_WIDE_INT size;
7350 if (GET_MODE_SIZE (mode).is_constant (&size))
7351 {
7352 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7353 mode);
7354 if (base_offset != 0)
7355 {
7356 base = plus_constant (Pmode, base, base_offset);
7357 base = force_operand (base, NULL_RTX);
7358 return plus_constant (Pmode, base, offset - base_offset);
7359 }
7360 }
7361 }
7362
7363 return x;
7364 }
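
/* Continuing the example above, legitimizing (plus (reg X) (const_int 0x4324))
   for an SImode access would emit Y = X + 0x4000 and return
   (plus (reg Y) (const_int 0x324)), so that nearby accesses off the same
   base are likely to CSE the anchor register Y.  */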
7365
7366 /* Return the reload icode required for loading a constant-pool entry of mode MODE. */
7367 static enum insn_code
7368 aarch64_constant_pool_reload_icode (machine_mode mode)
7369 {
7370 switch (mode)
7371 {
7372 case E_SFmode:
7373 return CODE_FOR_aarch64_reload_movcpsfdi;
7374
7375 case E_DFmode:
7376 return CODE_FOR_aarch64_reload_movcpdfdi;
7377
7378 case E_TFmode:
7379 return CODE_FOR_aarch64_reload_movcptfdi;
7380
7381 case E_V8QImode:
7382 return CODE_FOR_aarch64_reload_movcpv8qidi;
7383
7384 case E_V16QImode:
7385 return CODE_FOR_aarch64_reload_movcpv16qidi;
7386
7387 case E_V4HImode:
7388 return CODE_FOR_aarch64_reload_movcpv4hidi;
7389
7390 case E_V8HImode:
7391 return CODE_FOR_aarch64_reload_movcpv8hidi;
7392
7393 case E_V2SImode:
7394 return CODE_FOR_aarch64_reload_movcpv2sidi;
7395
7396 case E_V4SImode:
7397 return CODE_FOR_aarch64_reload_movcpv4sidi;
7398
7399 case E_V2DImode:
7400 return CODE_FOR_aarch64_reload_movcpv2didi;
7401
7402 case E_V2DFmode:
7403 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7404
7405 default:
7406 gcc_unreachable ();
7407 }
7408
7409 gcc_unreachable ();
7410 }
7411 static reg_class_t
7412 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7413 reg_class_t rclass,
7414 machine_mode mode,
7415 secondary_reload_info *sri)
7416 {
7417 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7418 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7419 comment at the head of aarch64-sve.md for more details about the
7420 big-endian handling. */
7421 if (BYTES_BIG_ENDIAN
7422 && reg_class_subset_p (rclass, FP_REGS)
7423 && !((REG_P (x) && HARD_REGISTER_P (x))
7424 || aarch64_simd_valid_immediate (x, NULL))
7425 && aarch64_sve_data_mode_p (mode))
7426 {
7427 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7428 return NO_REGS;
7429 }
7430
7431 /* If we have to disable direct literal pool loads and stores because the
7432 function is too big, then we need a scratch register. */
7433 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7434 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7435 || targetm.vector_mode_supported_p (GET_MODE (x)))
7436 && !aarch64_pcrelative_literal_loads)
7437 {
7438 sri->icode = aarch64_constant_pool_reload_icode (mode);
7439 return NO_REGS;
7440 }
7441
7442 /* Without the TARGET_SIMD instructions we cannot move a Q register
7443 to a Q register directly. We need a scratch. */
7444 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7445 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7446 && reg_class_subset_p (rclass, FP_REGS))
7447 {
7448 if (mode == TFmode)
7449 sri->icode = CODE_FOR_aarch64_reload_movtf;
7450 else if (mode == TImode)
7451 sri->icode = CODE_FOR_aarch64_reload_movti;
7452 return NO_REGS;
7453 }
7454
7455 /* A TFmode or TImode memory access should be handled via FP_REGS
7456 because AArch64 has richer addressing modes for LDR/STR instructions
7457 than for LDP/STP instructions. */
7458 if (TARGET_FLOAT && rclass == GENERAL_REGS
7459 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7460 return FP_REGS;
7461
7462 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
7463 return GENERAL_REGS;
7464
7465 return NO_REGS;
7466 }
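
/* For instance, when SIMD is disabled but FP is available (e.g. with
   -march=armv8-a+nosimd), copying a TImode value that lives in an FP
   register into another FP register takes the aarch64_reload_movti path
   above and goes through a general-purpose scratch register instead.  */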
7467
7468 static bool
7469 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7470 {
7471 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7472
7473 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7474 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7475 if (frame_pointer_needed)
7476 return to == HARD_FRAME_POINTER_REGNUM;
7477 return true;
7478 }
7479
7480 poly_int64
7481 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7482 {
7483 aarch64_layout_frame ();
7484
7485 if (to == HARD_FRAME_POINTER_REGNUM)
7486 {
7487 if (from == ARG_POINTER_REGNUM)
7488 return cfun->machine->frame.hard_fp_offset;
7489
7490 if (from == FRAME_POINTER_REGNUM)
7491 return cfun->machine->frame.hard_fp_offset
7492 - cfun->machine->frame.locals_offset;
7493 }
7494
7495 if (to == STACK_POINTER_REGNUM)
7496 {
7497 if (from == FRAME_POINTER_REGNUM)
7498 return cfun->machine->frame.frame_size
7499 - cfun->machine->frame.locals_offset;
7500 }
7501
7502 return cfun->machine->frame.frame_size;
7503 }
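
/* In other words, using the layout computed by aarch64_layout_frame:

        arg pointer   -> hard FP:  hard_fp_offset
        frame pointer -> hard FP:  hard_fp_offset - locals_offset
        frame pointer -> SP:       frame_size - locals_offset
        arg pointer   -> SP:       frame_size

   The results are poly_int64s because the frame size may depend on the
   SVE vector length.  */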
7504
7505 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7506 previous frame. */
7507
7508 rtx
7509 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7510 {
7511 if (count != 0)
7512 return const0_rtx;
7513 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7514 }
7515
7516
7517 static void
7518 aarch64_asm_trampoline_template (FILE *f)
7519 {
7520 if (TARGET_ILP32)
7521 {
7522 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7523 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7524 }
7525 else
7526 {
7527 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7528 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7529 }
7530 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7531 assemble_aligned_integer (4, const0_rtx);
7532 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7533 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7534 }
7535
7536 static void
7537 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7538 {
7539 rtx fnaddr, mem, a_tramp;
7540 const int tramp_code_sz = 16;
7541
7542 /* We don't need to copy the trailing D-words; we fill those in below. */
7543 emit_block_move (m_tramp, assemble_trampoline_template (),
7544 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7545 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7546 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7547 if (GET_MODE (fnaddr) != ptr_mode)
7548 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7549 emit_move_insn (mem, fnaddr);
7550
7551 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7552 emit_move_insn (mem, chain_value);
7553
7554 /* XXX We should really define a "clear_cache" pattern and use
7555 gen_clear_cache(). */
7556 a_tramp = XEXP (m_tramp, 0);
7557 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7558 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7559 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7560 ptr_mode);
7561 }
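
/* The trampoline laid out by the template above and filled in here looks
   roughly like this for LP64 (x17 is IP1 and x18 the static chain register
   with the default ABI settings; under ILP32 the loads use w registers and
   the two pointer slots are 4 bytes each, at offsets 16 and 20):

        0:  ldr  x17, .+16      // load the target function address
        4:  ldr  x18, .+20      // load the static chain value
        8:  br   x17
       12:  .word 0             // padding
       16:  <address of the nested function>
       24:  <static chain value>  */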
7562
7563 static unsigned char
7564 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7565 {
7566 /* ??? Logically we should only need to provide a value when
7567 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7568 can hold MODE, but at the moment we need to handle all modes.
7569 Just ignore any runtime parts for registers that can't store them. */
7570 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7571 unsigned int nregs;
7572 switch (regclass)
7573 {
7574 case TAILCALL_ADDR_REGS:
7575 case POINTER_REGS:
7576 case GENERAL_REGS:
7577 case ALL_REGS:
7578 case POINTER_AND_FP_REGS:
7579 case FP_REGS:
7580 case FP_LO_REGS:
7581 if (aarch64_sve_data_mode_p (mode)
7582 && constant_multiple_p (GET_MODE_SIZE (mode),
7583 BYTES_PER_SVE_VECTOR, &nregs))
7584 return nregs;
7585 return (aarch64_vector_data_mode_p (mode)
7586 ? CEIL (lowest_size, UNITS_PER_VREG)
7587 : CEIL (lowest_size, UNITS_PER_WORD));
7588 case STACK_REG:
7589 case PR_REGS:
7590 case PR_LO_REGS:
7591 case PR_HI_REGS:
7592 return 1;
7593
7594 case NO_REGS:
7595 return 0;
7596
7597 default:
7598 break;
7599 }
7600 gcc_unreachable ();
7601 }
7602
7603 static reg_class_t
7604 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7605 {
7606 if (regclass == POINTER_REGS)
7607 return GENERAL_REGS;
7608
7609 if (regclass == STACK_REG)
7610 {
7611 if (REG_P (x)
7612 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7613 return regclass;
7614
7615 return NO_REGS;
7616 }
7617
7618 /* Register elimination can result in a request for
7619 SP+constant->FP_REGS. We cannot support such operations, which
7620 use SP as the source and an FP_REG as the destination, so reject
7621 them outright. */
7622 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7623 {
7624 rtx lhs = XEXP (x, 0);
7625
7626 /* Look through a possible SUBREG introduced by ILP32. */
7627 if (GET_CODE (lhs) == SUBREG)
7628 lhs = SUBREG_REG (lhs);
7629
7630 gcc_assert (REG_P (lhs));
7631 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7632 POINTER_REGS));
7633 return NO_REGS;
7634 }
7635
7636 return regclass;
7637 }
7638
7639 void
7640 aarch64_asm_output_labelref (FILE* f, const char *name)
7641 {
7642 asm_fprintf (f, "%U%s", name);
7643 }
7644
7645 static void
7646 aarch64_elf_asm_constructor (rtx symbol, int priority)
7647 {
7648 if (priority == DEFAULT_INIT_PRIORITY)
7649 default_ctor_section_asm_out_constructor (symbol, priority);
7650 else
7651 {
7652 section *s;
7653 /* Although priority is known to be in the range [0, 65535], so that
7654 18 bytes would be enough, the compiler might not know that. To avoid
7655 a -Wformat-truncation false positive, use a larger size. */
7656 char buf[23];
7657 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7658 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7659 switch_to_section (s);
7660 assemble_align (POINTER_SIZE);
7661 assemble_aligned_integer (POINTER_BYTES, symbol);
7662 }
7663 }
7664
7665 static void
7666 aarch64_elf_asm_destructor (rtx symbol, int priority)
7667 {
7668 if (priority == DEFAULT_INIT_PRIORITY)
7669 default_dtor_section_asm_out_destructor (symbol, priority);
7670 else
7671 {
7672 section *s;
7673 /* Although priority is known to be in the range [0, 65535], so that
7674 18 bytes would be enough, the compiler might not know that. To avoid
7675 a -Wformat-truncation false positive, use a larger size. */
7676 char buf[23];
7677 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7678 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7679 switch_to_section (s);
7680 assemble_align (POINTER_SIZE);
7681 assemble_aligned_integer (POINTER_BYTES, symbol);
7682 }
7683 }
7684
7685 const char*
7686 aarch64_output_casesi (rtx *operands)
7687 {
7688 char buf[100];
7689 char label[100];
7690 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7691 int index;
7692 static const char *const patterns[4][2] =
7693 {
7694 {
7695 "ldrb\t%w3, [%0,%w1,uxtw]",
7696 "add\t%3, %4, %w3, sxtb #2"
7697 },
7698 {
7699 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7700 "add\t%3, %4, %w3, sxth #2"
7701 },
7702 {
7703 "ldr\t%w3, [%0,%w1,uxtw #2]",
7704 "add\t%3, %4, %w3, sxtw #2"
7705 },
7706 /* We assume that DImode is only generated when not optimizing and
7707 that we don't really need 64-bit address offsets. That would
7708 imply an object file with 8GB of code in a single function! */
7709 {
7710 "ldr\t%w3, [%0,%w1,uxtw #2]",
7711 "add\t%3, %4, %w3, sxtw #2"
7712 }
7713 };
7714
7715 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7716
7717 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7718 index = exact_log2 (GET_MODE_SIZE (mode));
7719
7720 gcc_assert (index >= 0 && index <= 3);
7721
7722 /* Need to implement table size reduction, by changing the code below. */
7723 output_asm_insn (patterns[index][0], operands);
7724 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7725 snprintf (buf, sizeof (buf),
7726 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7727 output_asm_insn (buf, operands);
7728 output_asm_insn (patterns[index][1], operands);
7729 output_asm_insn ("br\t%3", operands);
7730 assemble_label (asm_out_file, label);
7731 return "";
7732 }
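
/* For a HImode dispatch table, for example, the sequence printed above looks
   like the following (operands %0..%4 shown as x0/w1/x3/x4 and the internal
   label name chosen for illustration):

        ldrh    w3, [x0,w1,uxtw #1]     // load the table entry
        adr     x4, .Lrtx7              // address of the table
        add     x3, x4, w3, sxth #2     // scale the entry and add the base
        br      x3
   .Lrtx7:  */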
7733
7734
7735 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7736 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7737 operator. */
7738
7739 int
7740 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7741 {
7742 if (shift >= 0 && shift <= 3)
7743 {
7744 int size;
7745 for (size = 8; size <= 32; size *= 2)
7746 {
7747 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7748 if (mask == bits << shift)
7749 return size;
7750 }
7751 }
7752 return 0;
7753 }
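
/* For example, aarch64_uxt_size (2, 0x3fc) returns 8, because
   0x3fc == 0xff << 2: "(x << 2) & 0x3fc" is exactly the "x, uxtb #2"
   operand of an ADD/SUB (extended register).  */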
7754
7755 /* Constant pools are per-function only when PC-relative
7756 literal loads are enabled or we are using the large memory
7757 model. */
7758
7759 static inline bool
7760 aarch64_can_use_per_function_literal_pools_p (void)
7761 {
7762 return (aarch64_pcrelative_literal_loads
7763 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7764 }
7765
7766 static bool
7767 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7768 {
7769 /* We can't use blocks for constants when we're using a per-function
7770 constant pool. */
7771 return !aarch64_can_use_per_function_literal_pools_p ();
7772 }
7773
7774 /* Select appropriate section for constants depending
7775 on where we place literal pools. */
7776
7777 static section *
7778 aarch64_select_rtx_section (machine_mode mode,
7779 rtx x,
7780 unsigned HOST_WIDE_INT align)
7781 {
7782 if (aarch64_can_use_per_function_literal_pools_p ())
7783 return function_section (current_function_decl);
7784
7785 return default_elf_select_rtx_section (mode, x, align);
7786 }
7787
7788 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7789 void
7790 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7791 HOST_WIDE_INT offset)
7792 {
7793 /* When using per-function literal pools, we must ensure that any code
7794 section is aligned to the minimal instruction length, lest we get
7795 errors from the assembler re "unaligned instructions". */
7796 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7797 ASM_OUTPUT_ALIGN (f, 2);
7798 }
7799
7800 /* Costs. */
7801
7802 /* Helper function for rtx cost calculation. Strip a shift expression
7803 from X. Returns the inner operand if successful, or the original
7804 expression on failure. */
7805 static rtx
7806 aarch64_strip_shift (rtx x)
7807 {
7808 rtx op = x;
7809
7810 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7811 we can convert both to ROR during final output. */
7812 if ((GET_CODE (op) == ASHIFT
7813 || GET_CODE (op) == ASHIFTRT
7814 || GET_CODE (op) == LSHIFTRT
7815 || GET_CODE (op) == ROTATERT
7816 || GET_CODE (op) == ROTATE)
7817 && CONST_INT_P (XEXP (op, 1)))
7818 return XEXP (op, 0);
7819
7820 if (GET_CODE (op) == MULT
7821 && CONST_INT_P (XEXP (op, 1))
7822 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7823 return XEXP (op, 0);
7824
7825 return x;
7826 }
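
/* E.g. both (ashift (reg) (const_int 3)) and (mult (reg) (const_int 8))
   strip down to the inner register, since either form is printed as a
   "reg, lsl #3" shifted operand; anything else is returned unchanged.  */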
7827
7828 /* Helper function for rtx cost calculation. Strip an extend
7829 expression from X. Returns the inner operand if successful, or the
7830 original expression on failure. We deal with a number of possible
7831 canonicalization variations here. If STRIP_SHIFT is true, then
7832 we can strip off a shift also. */
7833 static rtx
7834 aarch64_strip_extend (rtx x, bool strip_shift)
7835 {
7836 scalar_int_mode mode;
7837 rtx op = x;
7838
7839 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7840 return op;
7841
7842 /* Zero and sign extraction of a widened value. */
7843 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7844 && XEXP (op, 2) == const0_rtx
7845 && GET_CODE (XEXP (op, 0)) == MULT
7846 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7847 XEXP (op, 1)))
7848 return XEXP (XEXP (op, 0), 0);
7849
7850 /* It can also be represented (for zero-extend) as an AND with an
7851 immediate. */
7852 if (GET_CODE (op) == AND
7853 && GET_CODE (XEXP (op, 0)) == MULT
7854 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7855 && CONST_INT_P (XEXP (op, 1))
7856 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7857 INTVAL (XEXP (op, 1))) != 0)
7858 return XEXP (XEXP (op, 0), 0);
7859
7860 /* Now handle extended register, as this may also have an optional
7861 left shift by 1..4. */
7862 if (strip_shift
7863 && GET_CODE (op) == ASHIFT
7864 && CONST_INT_P (XEXP (op, 1))
7865 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7866 op = XEXP (op, 0);
7867
7868 if (GET_CODE (op) == ZERO_EXTEND
7869 || GET_CODE (op) == SIGN_EXTEND)
7870 op = XEXP (op, 0);
7871
7872 if (op != x)
7873 return op;
7874
7875 return x;
7876 }
7877
7878 /* Return true iff CODE is a shift supported in combination
7879 with arithmetic instructions. */
7880
7881 static bool
7882 aarch64_shift_p (enum rtx_code code)
7883 {
7884 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7885 }
7886
7887
7888 /* Return true iff X is a cheap shift without a sign extend. */
7889
7890 static bool
7891 aarch64_cheap_mult_shift_p (rtx x)
7892 {
7893 rtx op0, op1;
7894
7895 op0 = XEXP (x, 0);
7896 op1 = XEXP (x, 1);
7897
7898 if (!(aarch64_tune_params.extra_tuning_flags
7899 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7900 return false;
7901
7902 if (GET_CODE (op0) == SIGN_EXTEND)
7903 return false;
7904
7905 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7906 && UINTVAL (op1) <= 4)
7907 return true;
7908
7909 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7910 return false;
7911
7912 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7913
7914 if (l2 > 0 && l2 <= 4)
7915 return true;
7916
7917 return false;
7918 }
7919
7920 /* Helper function for rtx cost calculation. Calculate the cost of
7921 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7922 Return the calculated cost of the expression, recursing manually in to
7923 operands where needed. */
7924
7925 static int
7926 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7927 {
7928 rtx op0, op1;
7929 const struct cpu_cost_table *extra_cost
7930 = aarch64_tune_params.insn_extra_cost;
7931 int cost = 0;
7932 bool compound_p = (outer == PLUS || outer == MINUS);
7933 machine_mode mode = GET_MODE (x);
7934
7935 gcc_checking_assert (code == MULT);
7936
7937 op0 = XEXP (x, 0);
7938 op1 = XEXP (x, 1);
7939
7940 if (VECTOR_MODE_P (mode))
7941 mode = GET_MODE_INNER (mode);
7942
7943 /* Integer multiply/fma. */
7944 if (GET_MODE_CLASS (mode) == MODE_INT)
7945 {
7946 /* The multiply will be canonicalized as a shift, cost it as such. */
7947 if (aarch64_shift_p (GET_CODE (x))
7948 || (CONST_INT_P (op1)
7949 && exact_log2 (INTVAL (op1)) > 0))
7950 {
7951 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7952 || GET_CODE (op0) == SIGN_EXTEND;
7953 if (speed)
7954 {
7955 if (compound_p)
7956 {
7957 /* If the shift is considered cheap,
7958 then don't add any cost. */
7959 if (aarch64_cheap_mult_shift_p (x))
7960 ;
7961 else if (REG_P (op1))
7962 /* ARITH + shift-by-register. */
7963 cost += extra_cost->alu.arith_shift_reg;
7964 else if (is_extend)
7965 /* ARITH + extended register. We don't have a cost field
7966 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7967 cost += extra_cost->alu.extend_arith;
7968 else
7969 /* ARITH + shift-by-immediate. */
7970 cost += extra_cost->alu.arith_shift;
7971 }
7972 else
7973 /* LSL (immediate). */
7974 cost += extra_cost->alu.shift;
7975
7976 }
7977 /* Strip extends as we will have costed them in the case above. */
7978 if (is_extend)
7979 op0 = aarch64_strip_extend (op0, true);
7980
7981 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7982
7983 return cost;
7984 }
7985
7986 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7987 compound and let the below cases handle it. After all, MNEG is a
7988 special-case alias of MSUB. */
7989 if (GET_CODE (op0) == NEG)
7990 {
7991 op0 = XEXP (op0, 0);
7992 compound_p = true;
7993 }
7994
7995 /* Integer multiplies or FMAs have zero/sign extending variants. */
7996 if ((GET_CODE (op0) == ZERO_EXTEND
7997 && GET_CODE (op1) == ZERO_EXTEND)
7998 || (GET_CODE (op0) == SIGN_EXTEND
7999 && GET_CODE (op1) == SIGN_EXTEND))
8000 {
8001 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8002 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8003
8004 if (speed)
8005 {
8006 if (compound_p)
8007 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8008 cost += extra_cost->mult[0].extend_add;
8009 else
8010 /* MUL/SMULL/UMULL. */
8011 cost += extra_cost->mult[0].extend;
8012 }
8013
8014 return cost;
8015 }
8016
8017 /* This is either an integer multiply or a MADD. In both cases
8018 we want to recurse and cost the operands. */
8019 cost += rtx_cost (op0, mode, MULT, 0, speed);
8020 cost += rtx_cost (op1, mode, MULT, 1, speed);
8021
8022 if (speed)
8023 {
8024 if (compound_p)
8025 /* MADD/MSUB. */
8026 cost += extra_cost->mult[mode == DImode].add;
8027 else
8028 /* MUL. */
8029 cost += extra_cost->mult[mode == DImode].simple;
8030 }
8031
8032 return cost;
8033 }
8034 else
8035 {
8036 if (speed)
8037 {
8038 /* Floating-point FMA/FMUL can also support negations of the
8039 operands, unless the rounding mode is upward or downward in
8040 which case FNMUL is different from FMUL with operand negation. */
8041 bool neg0 = GET_CODE (op0) == NEG;
8042 bool neg1 = GET_CODE (op1) == NEG;
8043 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8044 {
8045 if (neg0)
8046 op0 = XEXP (op0, 0);
8047 if (neg1)
8048 op1 = XEXP (op1, 0);
8049 }
8050
8051 if (compound_p)
8052 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8053 cost += extra_cost->fp[mode == DFmode].fma;
8054 else
8055 /* FMUL/FNMUL. */
8056 cost += extra_cost->fp[mode == DFmode].mult;
8057 }
8058
8059 cost += rtx_cost (op0, mode, MULT, 0, speed);
8060 cost += rtx_cost (op1, mode, MULT, 1, speed);
8061 return cost;
8062 }
8063 }
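
/* For example, when costing (plus (mult (reg) (const_int 8)) (reg)) for
   speed, the multiply is treated as the "LSL #3" half of an add-with-shift:
   the code above adds extra_cost->alu.arith_shift (or nothing at all when
   the tuning marks such shifts as cheap) plus the recursive cost of the
   shifted operand.  */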
8064
8065 static int
8066 aarch64_address_cost (rtx x,
8067 machine_mode mode,
8068 addr_space_t as ATTRIBUTE_UNUSED,
8069 bool speed)
8070 {
8071 enum rtx_code c = GET_CODE (x);
8072 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8073 struct aarch64_address_info info;
8074 int cost = 0;
8075 info.shift = 0;
8076
8077 if (!aarch64_classify_address (&info, x, mode, false))
8078 {
8079 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8080 {
8081 /* This is a CONST or SYMBOL ref which will be split
8082 in a different way depending on the code model in use.
8083 Cost it through the generic infrastructure. */
8084 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8085 /* Divide through by the cost of one instruction to
8086 bring it to the same units as the address costs. */
8087 cost_symbol_ref /= COSTS_N_INSNS (1);
8088 /* The cost is then the cost of preparing the address,
8089 followed by an immediate (possibly 0) offset. */
8090 return cost_symbol_ref + addr_cost->imm_offset;
8091 }
8092 else
8093 {
8094 /* This is most likely a jump table from a case
8095 statement. */
8096 return addr_cost->register_offset;
8097 }
8098 }
8099
8100 switch (info.type)
8101 {
8102 case ADDRESS_LO_SUM:
8103 case ADDRESS_SYMBOLIC:
8104 case ADDRESS_REG_IMM:
8105 cost += addr_cost->imm_offset;
8106 break;
8107
8108 case ADDRESS_REG_WB:
8109 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8110 cost += addr_cost->pre_modify;
8111 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8112 cost += addr_cost->post_modify;
8113 else
8114 gcc_unreachable ();
8115
8116 break;
8117
8118 case ADDRESS_REG_REG:
8119 cost += addr_cost->register_offset;
8120 break;
8121
8122 case ADDRESS_REG_SXTW:
8123 cost += addr_cost->register_sextend;
8124 break;
8125
8126 case ADDRESS_REG_UXTW:
8127 cost += addr_cost->register_zextend;
8128 break;
8129
8130 default:
8131 gcc_unreachable ();
8132 }
8133
8134
8135 if (info.shift > 0)
8136 {
8137 /* For the sake of calculating the cost of the shifted register
8138 component, we can treat same sized modes in the same way. */
8139 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8140 cost += addr_cost->addr_scale_costs.hi;
8141 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8142 cost += addr_cost->addr_scale_costs.si;
8143 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8144 cost += addr_cost->addr_scale_costs.di;
8145 else
8146 /* We can't tell, or this is a 128-bit vector. */
8147 cost += addr_cost->addr_scale_costs.ti;
8148 }
8149
8150 return cost;
8151 }
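
/* As an illustration, for a DImode access the address
   (plus (reg) (const_int 16)) costs addr_cost->imm_offset, while
   (plus (reg) (mult (reg) (const_int 8))) costs addr_cost->register_offset
   plus addr_cost->addr_scale_costs.di for the scaled index register.  */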
8152
8153 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8154 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8155 to be taken. */
8156
8157 int
8158 aarch64_branch_cost (bool speed_p, bool predictable_p)
8159 {
8160 /* When optimizing for speed, use the cost of unpredictable branches. */
8161 const struct cpu_branch_cost *branch_costs =
8162 aarch64_tune_params.branch_costs;
8163
8164 if (!speed_p || predictable_p)
8165 return branch_costs->predictable;
8166 else
8167 return branch_costs->unpredictable;
8168 }
8169
8170 /* Return true if the RTX X in mode MODE is a zero or sign extract
8171 usable in an ADD or SUB (extended register) instruction. */
8172 static bool
8173 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8174 {
8175 /* Catch add with a sign extract.
8176 This is add_<optab><mode>_multp2. */
8177 if (GET_CODE (x) == SIGN_EXTRACT
8178 || GET_CODE (x) == ZERO_EXTRACT)
8179 {
8180 rtx op0 = XEXP (x, 0);
8181 rtx op1 = XEXP (x, 1);
8182 rtx op2 = XEXP (x, 2);
8183
8184 if (GET_CODE (op0) == MULT
8185 && CONST_INT_P (op1)
8186 && op2 == const0_rtx
8187 && CONST_INT_P (XEXP (op0, 1))
8188 && aarch64_is_extend_from_extract (mode,
8189 XEXP (op0, 1),
8190 op1))
8191 {
8192 return true;
8193 }
8194 }
8195 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8196 No shift. */
8197 else if (GET_CODE (x) == SIGN_EXTEND
8198 || GET_CODE (x) == ZERO_EXTEND)
8199 return REG_P (XEXP (x, 0));
8200
8201 return false;
8202 }
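
/* For example (plus (sign_extend:DI (reg:SI)) (reg:DI)) maps onto
   "add x0, x1, w2, sxtw", while the ZERO/SIGN_EXTRACT form above is how
   combine represents the additionally shifted variant, e.g.
   "add x0, x1, w2, sxtw #2".  */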
8203
8204 static bool
8205 aarch64_frint_unspec_p (unsigned int u)
8206 {
8207 switch (u)
8208 {
8209 case UNSPEC_FRINTZ:
8210 case UNSPEC_FRINTP:
8211 case UNSPEC_FRINTM:
8212 case UNSPEC_FRINTA:
8213 case UNSPEC_FRINTN:
8214 case UNSPEC_FRINTX:
8215 case UNSPEC_FRINTI:
8216 return true;
8217
8218 default:
8219 return false;
8220 }
8221 }
8222
8223 /* Return true iff X is an rtx that will match an extr instruction
8224 i.e. as described in the *extr<mode>5_insn family of patterns.
8225 RES_OP0 and RES_OP1 will be set to the operands of the shifts involved
8226 on success and will be NULL_RTX otherwise. */
8227
8228 static bool
8229 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8230 {
8231 rtx op0, op1;
8232 scalar_int_mode mode;
8233 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8234 return false;
8235
8236 *res_op0 = NULL_RTX;
8237 *res_op1 = NULL_RTX;
8238
8239 if (GET_CODE (x) != IOR)
8240 return false;
8241
8242 op0 = XEXP (x, 0);
8243 op1 = XEXP (x, 1);
8244
8245 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8246 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8247 {
8248 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8249 if (GET_CODE (op1) == ASHIFT)
8250 std::swap (op0, op1);
8251
8252 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8253 return false;
8254
8255 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8256 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8257
8258 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8259 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8260 {
8261 *res_op0 = XEXP (op0, 0);
8262 *res_op1 = XEXP (op1, 0);
8263 return true;
8264 }
8265 }
8266
8267 return false;
8268 }
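
/* For DImode, for instance, (ior (ashift x (const_int 16))
   (lshiftrt y (const_int 48))) passes the check above (16 + 48 == 64) and
   can be emitted as a single "extr x0, x1, x2, #48", which extracts the
   64 bits starting at bit 48 of the x1:x2 register pair.  */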
8269
8270 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8271 storing it in *COST. Result is true if the total cost of the operation
8272 has now been calculated. */
8273 static bool
8274 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8275 {
8276 rtx inner;
8277 rtx comparator;
8278 enum rtx_code cmpcode;
8279
8280 if (COMPARISON_P (op0))
8281 {
8282 inner = XEXP (op0, 0);
8283 comparator = XEXP (op0, 1);
8284 cmpcode = GET_CODE (op0);
8285 }
8286 else
8287 {
8288 inner = op0;
8289 comparator = const0_rtx;
8290 cmpcode = NE;
8291 }
8292
8293 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8294 {
8295 /* Conditional branch. */
8296 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8297 return true;
8298 else
8299 {
8300 if (cmpcode == NE || cmpcode == EQ)
8301 {
8302 if (comparator == const0_rtx)
8303 {
8304 /* TBZ/TBNZ/CBZ/CBNZ. */
8305 if (GET_CODE (inner) == ZERO_EXTRACT)
8306 /* TBZ/TBNZ. */
8307 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8308 ZERO_EXTRACT, 0, speed);
8309 else
8310 /* CBZ/CBNZ. */
8311 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8312
8313 return true;
8314 }
8315 }
8316 else if (cmpcode == LT || cmpcode == GE)
8317 {
8318 /* TBZ/TBNZ. */
8319 if (comparator == const0_rtx)
8320 return true;
8321 }
8322 }
8323 }
8324 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8325 {
8326 /* CCMP. */
8327 if (GET_CODE (op1) == COMPARE)
8328 {
8329 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8330 if (XEXP (op1, 1) == const0_rtx)
8331 *cost += 1;
8332 if (speed)
8333 {
8334 machine_mode mode = GET_MODE (XEXP (op1, 0));
8335 const struct cpu_cost_table *extra_cost
8336 = aarch64_tune_params.insn_extra_cost;
8337
8338 if (GET_MODE_CLASS (mode) == MODE_INT)
8339 *cost += extra_cost->alu.arith;
8340 else
8341 *cost += extra_cost->fp[mode == DFmode].compare;
8342 }
8343 return true;
8344 }
8345
8346 /* It's a conditional operation based on the status flags,
8347 so it must be some flavor of CSEL. */
8348
8349 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8350 if (GET_CODE (op1) == NEG
8351 || GET_CODE (op1) == NOT
8352 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8353 op1 = XEXP (op1, 0);
8354 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8355 {
8356 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8357 op1 = XEXP (op1, 0);
8358 op2 = XEXP (op2, 0);
8359 }
8360
8361 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8362 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8363 return true;
8364 }
8365
8366 /* We don't know what this is, cost all operands. */
8367 return false;
8368 }
8369
8370 /* Check whether X is a bitfield operation of the form shift + extend that
8371 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8372 operand to which the bitfield operation is applied. Otherwise return
8373 NULL_RTX. */
8374
8375 static rtx
8376 aarch64_extend_bitfield_pattern_p (rtx x)
8377 {
8378 rtx_code outer_code = GET_CODE (x);
8379 machine_mode outer_mode = GET_MODE (x);
8380
8381 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8382 && outer_mode != SImode && outer_mode != DImode)
8383 return NULL_RTX;
8384
8385 rtx inner = XEXP (x, 0);
8386 rtx_code inner_code = GET_CODE (inner);
8387 machine_mode inner_mode = GET_MODE (inner);
8388 rtx op = NULL_RTX;
8389
8390 switch (inner_code)
8391 {
8392 case ASHIFT:
8393 if (CONST_INT_P (XEXP (inner, 1))
8394 && (inner_mode == QImode || inner_mode == HImode))
8395 op = XEXP (inner, 0);
8396 break;
8397 case LSHIFTRT:
8398 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8399 && (inner_mode == QImode || inner_mode == HImode))
8400 op = XEXP (inner, 0);
8401 break;
8402 case ASHIFTRT:
8403 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8404 && (inner_mode == QImode || inner_mode == HImode))
8405 op = XEXP (inner, 0);
8406 break;
8407 default:
8408 break;
8409 }
8410
8411 return op;
8412 }
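
/* For instance (zero_extend:SI (lshiftrt:QI (reg) (const_int 3))) is an
   unsigned 5-bit field starting at bit 3 and maps onto a UBFX, while
   (sign_extend:SI (ashift:HI (reg) (const_int 2))) places the low 14 bits
   at position 2 with sign extension and maps onto an SBFIZ; in both cases
   the inner register is returned so that it can be costed.  */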
8413
8414 /* Return true if the mask and a shift amount from an RTX of the form
8415 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8416 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8417
8418 bool
8419 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8420 rtx shft_amnt)
8421 {
8422 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8423 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8424 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8425 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8426 }
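
/* For example, in SImode a mask of 0xff0 with a shift amount of 4 passes
   all of the checks above: (0xff0 >> 4) + 1 == 0x100 is a power of two and
   the low four mask bits are clear, so "(x << 4) & 0xff0" can be emitted as
   "ubfiz w0, w1, #4, #8".  */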
8427
8428 /* Calculate the cost of calculating X, storing it in *COST. Result
8429 is true if the total cost of the operation has now been calculated. */
8430 static bool
8431 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8432 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8433 {
8434 rtx op0, op1, op2;
8435 const struct cpu_cost_table *extra_cost
8436 = aarch64_tune_params.insn_extra_cost;
8437 int code = GET_CODE (x);
8438 scalar_int_mode int_mode;
8439
8440 /* By default, assume that everything has equivalent cost to the
8441 cheapest instruction. Any additional costs are applied as a delta
8442 above this default. */
8443 *cost = COSTS_N_INSNS (1);
8444
8445 switch (code)
8446 {
8447 case SET:
8448 /* The cost depends entirely on the operands to SET. */
8449 *cost = 0;
8450 op0 = SET_DEST (x);
8451 op1 = SET_SRC (x);
8452
8453 switch (GET_CODE (op0))
8454 {
8455 case MEM:
8456 if (speed)
8457 {
8458 rtx address = XEXP (op0, 0);
8459 if (VECTOR_MODE_P (mode))
8460 *cost += extra_cost->ldst.storev;
8461 else if (GET_MODE_CLASS (mode) == MODE_INT)
8462 *cost += extra_cost->ldst.store;
8463 else if (mode == SFmode)
8464 *cost += extra_cost->ldst.storef;
8465 else if (mode == DFmode)
8466 *cost += extra_cost->ldst.stored;
8467
8468 *cost +=
8469 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8470 0, speed));
8471 }
8472
8473 *cost += rtx_cost (op1, mode, SET, 1, speed);
8474 return true;
8475
8476 case SUBREG:
8477 if (! REG_P (SUBREG_REG (op0)))
8478 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8479
8480 /* Fall through. */
8481 case REG:
8482 /* The cost is one per vector-register copied. */
8483 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8484 {
8485 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8486 *cost = COSTS_N_INSNS (nregs);
8487 }
8488 /* const0_rtx is in general free, but we will use an
8489 instruction to set a register to 0. */
8490 else if (REG_P (op1) || op1 == const0_rtx)
8491 {
8492 /* The cost is 1 per register copied. */
8493 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8494 *cost = COSTS_N_INSNS (nregs);
8495 }
8496 else
8497 /* Cost is just the cost of the RHS of the set. */
8498 *cost += rtx_cost (op1, mode, SET, 1, speed);
8499 return true;
8500
8501 case ZERO_EXTRACT:
8502 case SIGN_EXTRACT:
8503 /* Bit-field insertion. Strip any redundant widening of
8504 the RHS to meet the width of the target. */
8505 if (GET_CODE (op1) == SUBREG)
8506 op1 = SUBREG_REG (op1);
8507 if ((GET_CODE (op1) == ZERO_EXTEND
8508 || GET_CODE (op1) == SIGN_EXTEND)
8509 && CONST_INT_P (XEXP (op0, 1))
8510 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8511 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8512 op1 = XEXP (op1, 0);
8513
8514 if (CONST_INT_P (op1))
8515 {
8516 /* MOV immediate is assumed to always be cheap. */
8517 *cost = COSTS_N_INSNS (1);
8518 }
8519 else
8520 {
8521 /* BFM. */
8522 if (speed)
8523 *cost += extra_cost->alu.bfi;
8524 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8525 }
8526
8527 return true;
8528
8529 default:
8530 /* We can't make sense of this, assume default cost. */
8531 *cost = COSTS_N_INSNS (1);
8532 return false;
8533 }
8534 return false;
8535
8536 case CONST_INT:
8537 /* If an instruction can incorporate a constant within the
8538 instruction, the instruction's expression avoids calling
8539 rtx_cost() on the constant. If rtx_cost() is called on a
8540 constant, then it is usually because the constant must be
8541 moved into a register by one or more instructions.
8542
8543 The exception is constant 0, which can be expressed
8544 as XZR/WZR and is therefore free. The exception to this is
8545 if we have (set (reg) (const0_rtx)) in which case we must cost
8546 the move. However, we can catch that when we cost the SET, so
8547 we don't need to consider that here. */
8548 if (x == const0_rtx)
8549 *cost = 0;
8550 else
8551 {
8552 /* To a first approximation, the cost of building any other
8553 constant is proportional to the number of instructions
8554 required to build that constant. This is true whether we
8555 are compiling for SPEED or otherwise. */
8556 if (!is_a <scalar_int_mode> (mode, &int_mode))
8557 int_mode = word_mode;
8558 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8559 (NULL_RTX, x, false, int_mode));
8560 }
8561 return true;
8562
8563 case CONST_DOUBLE:
8564
8565 /* First determine number of instructions to do the move
8566 as an integer constant. */
8567 if (!aarch64_float_const_representable_p (x)
8568 && !aarch64_can_const_movi_rtx_p (x, mode)
8569 && aarch64_float_const_rtx_p (x))
8570 {
8571 unsigned HOST_WIDE_INT ival;
8572 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8573 gcc_assert (succeed);
8574
8575 scalar_int_mode imode = (mode == HFmode
8576 ? SImode
8577 : int_mode_for_mode (mode).require ());
8578 int ncost = aarch64_internal_mov_immediate
8579 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8580 *cost += COSTS_N_INSNS (ncost);
8581 return true;
8582 }
8583
8584 if (speed)
8585 {
8586 /* mov[df,sf]_aarch64. */
8587 if (aarch64_float_const_representable_p (x))
8588 /* FMOV (scalar immediate). */
8589 *cost += extra_cost->fp[mode == DFmode].fpconst;
8590 else if (!aarch64_float_const_zero_rtx_p (x))
8591 {
8592 /* This will be a load from memory. */
8593 if (mode == DFmode)
8594 *cost += extra_cost->ldst.loadd;
8595 else
8596 *cost += extra_cost->ldst.loadf;
8597 }
8598 else
8599 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8600 or MOV v0.s[0], wzr - neither of which is modeled by the
8601 cost tables. Just use the default cost. */
8602 {
8603 }
8604 }
8605
8606 return true;
8607
8608 case MEM:
8609 if (speed)
8610 {
8611 /* For loads we want the base cost of a load, plus an
8612 approximation for the additional cost of the addressing
8613 mode. */
8614 rtx address = XEXP (x, 0);
8615 if (VECTOR_MODE_P (mode))
8616 *cost += extra_cost->ldst.loadv;
8617 else if (GET_MODE_CLASS (mode) == MODE_INT)
8618 *cost += extra_cost->ldst.load;
8619 else if (mode == SFmode)
8620 *cost += extra_cost->ldst.loadf;
8621 else if (mode == DFmode)
8622 *cost += extra_cost->ldst.loadd;
8623
8624 *cost +=
8625 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8626 0, speed));
8627 }
8628
8629 return true;
8630
8631 case NEG:
8632 op0 = XEXP (x, 0);
8633
8634 if (VECTOR_MODE_P (mode))
8635 {
8636 if (speed)
8637 {
8638 /* FNEG. */
8639 *cost += extra_cost->vect.alu;
8640 }
8641 return false;
8642 }
8643
8644 if (GET_MODE_CLASS (mode) == MODE_INT)
8645 {
8646 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8647 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8648 {
8649 /* CSETM. */
8650 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8651 return true;
8652 }
8653
8654 /* Cost this as SUB wzr, X. */
8655 op0 = CONST0_RTX (mode);
8656 op1 = XEXP (x, 0);
8657 goto cost_minus;
8658 }
8659
8660 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8661 {
8662 /* Support (neg(fma...)) as a single instruction only if
8663 sign of zeros is unimportant. This matches the decision
8664 making in aarch64.md. */
8665 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8666 {
8667 /* FNMADD. */
8668 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8669 return true;
8670 }
8671 if (GET_CODE (op0) == MULT)
8672 {
8673 /* FNMUL. */
8674 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8675 return true;
8676 }
8677 if (speed)
8678 /* FNEG. */
8679 *cost += extra_cost->fp[mode == DFmode].neg;
8680 return false;
8681 }
8682
8683 return false;
8684
8685 case CLRSB:
8686 case CLZ:
8687 if (speed)
8688 {
8689 if (VECTOR_MODE_P (mode))
8690 *cost += extra_cost->vect.alu;
8691 else
8692 *cost += extra_cost->alu.clz;
8693 }
8694
8695 return false;
8696
8697 case COMPARE:
8698 op0 = XEXP (x, 0);
8699 op1 = XEXP (x, 1);
8700
8701 if (op1 == const0_rtx
8702 && GET_CODE (op0) == AND)
8703 {
8704 x = op0;
8705 mode = GET_MODE (op0);
8706 goto cost_logic;
8707 }
8708
8709 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8710 {
8711 /* TODO: A write to the CC flags possibly costs extra; this
8712 needs encoding in the cost tables. */
8713
8714 mode = GET_MODE (op0);
8715 /* ANDS. */
8716 if (GET_CODE (op0) == AND)
8717 {
8718 x = op0;
8719 goto cost_logic;
8720 }
8721
8722 if (GET_CODE (op0) == PLUS)
8723 {
8724 /* ADDS (and CMN alias). */
8725 x = op0;
8726 goto cost_plus;
8727 }
8728
8729 if (GET_CODE (op0) == MINUS)
8730 {
8731 /* SUBS. */
8732 x = op0;
8733 goto cost_minus;
8734 }
8735
8736 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8737 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8738 && CONST_INT_P (XEXP (op0, 2)))
8739 {
8740 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8741 Handle it here directly rather than going to cost_logic
8742 since we know the immediate generated for the TST is valid
8743 so we can avoid creating an intermediate rtx for it only
8744 for costing purposes. */
8745 if (speed)
8746 *cost += extra_cost->alu.logical;
8747
8748 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8749 ZERO_EXTRACT, 0, speed);
8750 return true;
8751 }
8752
8753 if (GET_CODE (op1) == NEG)
8754 {
8755 /* CMN. */
8756 if (speed)
8757 *cost += extra_cost->alu.arith;
8758
8759 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8760 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8761 return true;
8762 }
8763
8764 /* CMP.
8765
8766 Compare can freely swap the order of operands, and
8767 canonicalization puts the more complex operation first.
8768 But the integer MINUS logic expects the shift/extend
8769 operation in op1. */
8770 if (! (REG_P (op0)
8771 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8772 {
8773 op0 = XEXP (x, 1);
8774 op1 = XEXP (x, 0);
8775 }
8776 goto cost_minus;
8777 }
8778
8779 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8780 {
8781 /* FCMP. */
8782 if (speed)
8783 *cost += extra_cost->fp[mode == DFmode].compare;
8784
8785 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8786 {
8787 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8788 /* FCMP supports constant 0.0 for no extra cost. */
8789 return true;
8790 }
8791 return false;
8792 }
8793
8794 if (VECTOR_MODE_P (mode))
8795 {
8796 /* Vector compare. */
8797 if (speed)
8798 *cost += extra_cost->vect.alu;
8799
8800 if (aarch64_float_const_zero_rtx_p (op1))
8801 {
8802 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8803 cost. */
8804 return true;
8805 }
8806 return false;
8807 }
8808 return false;
8809
8810 case MINUS:
8811 {
8812 op0 = XEXP (x, 0);
8813 op1 = XEXP (x, 1);
8814
8815 cost_minus:
8816 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8817
8818 /* Detect valid immediates. */
8819 if ((GET_MODE_CLASS (mode) == MODE_INT
8820 || (GET_MODE_CLASS (mode) == MODE_CC
8821 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8822 && CONST_INT_P (op1)
8823 && aarch64_uimm12_shift (INTVAL (op1)))
8824 {
8825 if (speed)
8826 /* SUB(S) (immediate). */
8827 *cost += extra_cost->alu.arith;
8828 return true;
8829 }
8830
8831 /* Look for SUB (extended register). */
8832 if (is_a <scalar_int_mode> (mode, &int_mode)
8833 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8834 {
8835 if (speed)
8836 *cost += extra_cost->alu.extend_arith;
8837
8838 op1 = aarch64_strip_extend (op1, true);
8839 *cost += rtx_cost (op1, VOIDmode,
8840 (enum rtx_code) GET_CODE (op1), 0, speed);
8841 return true;
8842 }
8843
8844 rtx new_op1 = aarch64_strip_extend (op1, false);
8845
8846 /* Cost this as an FMA-alike operation. */
8847 if ((GET_CODE (new_op1) == MULT
8848 || aarch64_shift_p (GET_CODE (new_op1)))
8849 && code != COMPARE)
8850 {
8851 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8852 (enum rtx_code) code,
8853 speed);
8854 return true;
8855 }
8856
8857 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8858
8859 if (speed)
8860 {
8861 if (VECTOR_MODE_P (mode))
8862 {
8863 /* Vector SUB. */
8864 *cost += extra_cost->vect.alu;
8865 }
8866 else if (GET_MODE_CLASS (mode) == MODE_INT)
8867 {
8868 /* SUB(S). */
8869 *cost += extra_cost->alu.arith;
8870 }
8871 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8872 {
8873 /* FSUB. */
8874 *cost += extra_cost->fp[mode == DFmode].addsub;
8875 }
8876 }
8877 return true;
8878 }
8879
8880 case PLUS:
8881 {
8882 rtx new_op0;
8883
8884 op0 = XEXP (x, 0);
8885 op1 = XEXP (x, 1);
8886
8887 cost_plus:
8888 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8889 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8890 {
8891 /* CSINC. */
8892 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8893 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8894 return true;
8895 }
8896
8897 if (GET_MODE_CLASS (mode) == MODE_INT
8898 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8899 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8900 {
8901 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8902
8903 if (speed)
8904 /* ADD (immediate). */
8905 *cost += extra_cost->alu.arith;
8906 return true;
8907 }
8908
8909 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8910
8911 /* Look for ADD (extended register). */
8912 if (is_a <scalar_int_mode> (mode, &int_mode)
8913 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8914 {
8915 if (speed)
8916 *cost += extra_cost->alu.extend_arith;
8917
8918 op0 = aarch64_strip_extend (op0, true);
8919 *cost += rtx_cost (op0, VOIDmode,
8920 (enum rtx_code) GET_CODE (op0), 0, speed);
8921 return true;
8922 }
8923
8924 /* Strip any extend, leave shifts behind as we will
8925 cost them through mult_cost. */
8926 new_op0 = aarch64_strip_extend (op0, false);
8927
8928 if (GET_CODE (new_op0) == MULT
8929 || aarch64_shift_p (GET_CODE (new_op0)))
8930 {
8931 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8932 speed);
8933 return true;
8934 }
8935
8936 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8937
8938 if (speed)
8939 {
8940 if (VECTOR_MODE_P (mode))
8941 {
8942 /* Vector ADD. */
8943 *cost += extra_cost->vect.alu;
8944 }
8945 else if (GET_MODE_CLASS (mode) == MODE_INT)
8946 {
8947 /* ADD. */
8948 *cost += extra_cost->alu.arith;
8949 }
8950 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8951 {
8952 /* FADD. */
8953 *cost += extra_cost->fp[mode == DFmode].addsub;
8954 }
8955 }
8956 return true;
8957 }
8958
8959 case BSWAP:
8960 *cost = COSTS_N_INSNS (1);
8961
8962 if (speed)
8963 {
8964 if (VECTOR_MODE_P (mode))
8965 *cost += extra_cost->vect.alu;
8966 else
8967 *cost += extra_cost->alu.rev;
8968 }
8969 return false;
8970
8971 case IOR:
8972 if (aarch_rev16_p (x))
8973 {
8974 *cost = COSTS_N_INSNS (1);
8975
8976 if (speed)
8977 {
8978 if (VECTOR_MODE_P (mode))
8979 *cost += extra_cost->vect.alu;
8980 else
8981 *cost += extra_cost->alu.rev;
8982 }
8983 return true;
8984 }
8985
8986 if (aarch64_extr_rtx_p (x, &op0, &op1))
8987 {
8988 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8989 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8990 if (speed)
8991 *cost += extra_cost->alu.shift;
8992
8993 return true;
8994 }
8995 /* Fall through. */
8996 case XOR:
8997 case AND:
8998 cost_logic:
8999 op0 = XEXP (x, 0);
9000 op1 = XEXP (x, 1);
9001
9002 if (VECTOR_MODE_P (mode))
9003 {
9004 if (speed)
9005 *cost += extra_cost->vect.alu;
9006 return true;
9007 }
9008
9009 if (code == AND
9010 && GET_CODE (op0) == MULT
9011 && CONST_INT_P (XEXP (op0, 1))
9012 && CONST_INT_P (op1)
9013 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9014 INTVAL (op1)) != 0)
9015 {
9016 /* This is a UBFM/SBFM. */
9017 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9018 if (speed)
9019 *cost += extra_cost->alu.bfx;
9020 return true;
9021 }
9022
9023 if (is_int_mode (mode, &int_mode))
9024 {
9025 if (CONST_INT_P (op1))
9026 {
9027 /* We have a mask + shift version of a UBFIZ
9028 i.e. the *andim_ashift<mode>_bfiz pattern. */
9029 if (GET_CODE (op0) == ASHIFT
9030 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9031 XEXP (op0, 1)))
9032 {
9033 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9034 (enum rtx_code) code, 0, speed);
9035 if (speed)
9036 *cost += extra_cost->alu.bfx;
9037
9038 return true;
9039 }
9040 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9041 {
9042 /* We possibly get the immediate for free; this is not
9043 modelled. */
9044 *cost += rtx_cost (op0, int_mode,
9045 (enum rtx_code) code, 0, speed);
9046 if (speed)
9047 *cost += extra_cost->alu.logical;
9048
9049 return true;
9050 }
9051 }
9052 else
9053 {
9054 rtx new_op0 = op0;
9055
9056 /* Handle ORN, EON, or BIC. */
9057 if (GET_CODE (op0) == NOT)
9058 op0 = XEXP (op0, 0);
9059
9060 new_op0 = aarch64_strip_shift (op0);
9061
9062 /* If we had a shift on op0 then this is a logical-shift-
9063 by-register/immediate operation. Otherwise, this is just
9064 a logical operation. */
9065 if (speed)
9066 {
9067 if (new_op0 != op0)
9068 {
9069 /* Shift by immediate. */
9070 if (CONST_INT_P (XEXP (op0, 1)))
9071 *cost += extra_cost->alu.log_shift;
9072 else
9073 *cost += extra_cost->alu.log_shift_reg;
9074 }
9075 else
9076 *cost += extra_cost->alu.logical;
9077 }
9078
9079 /* In both cases we want to cost both operands. */
9080 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9081 0, speed);
9082 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9083 1, speed);
9084
9085 return true;
9086 }
9087 }
9088 return false;
9089
9090 case NOT:
9091 x = XEXP (x, 0);
9092 op0 = aarch64_strip_shift (x);
9093
9094 if (VECTOR_MODE_P (mode))
9095 {
9096 /* Vector NOT. */
9097 *cost += extra_cost->vect.alu;
9098 return false;
9099 }
9100
9101 /* MVN-shifted-reg. */
9102 if (op0 != x)
9103 {
9104 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9105
9106 if (speed)
9107 *cost += extra_cost->alu.log_shift;
9108
9109 return true;
9110 }
9111 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9112 Handle the second form here taking care that 'a' in the above can
9113 be a shift. */
9114 else if (GET_CODE (op0) == XOR)
9115 {
9116 rtx newop0 = XEXP (op0, 0);
9117 rtx newop1 = XEXP (op0, 1);
9118 rtx op0_stripped = aarch64_strip_shift (newop0);
9119
9120 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9121 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9122
9123 if (speed)
9124 {
9125 if (op0_stripped != newop0)
9126 *cost += extra_cost->alu.log_shift;
9127 else
9128 *cost += extra_cost->alu.logical;
9129 }
9130
9131 return true;
9132 }
9133 /* MVN. */
9134 if (speed)
9135 *cost += extra_cost->alu.logical;
9136
9137 return false;
9138
9139 case ZERO_EXTEND:
9140
9141 op0 = XEXP (x, 0);
9142 /* If a value is written in SI mode, then zero extended to DI
9143 mode, the operation will in general be free as a write to
9144 a 'w' register implicitly zeroes the upper bits of an 'x'
9145 register. However, if this is
9146
9147 (set (reg) (zero_extend (reg)))
9148
9149 we must cost the explicit register move. */
9150 if (mode == DImode
9151 && GET_MODE (op0) == SImode
9152 && outer == SET)
9153 {
9154 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9155
9156 /* If OP_COST is non-zero, then the cost of the zero extend
9157 is effectively the cost of the inner operation. Otherwise
9158 we have a MOV instruction and we take the cost from the MOV
9159 itself. This is true independently of whether we are
9160 optimizing for space or time. */
9161 if (op_cost)
9162 *cost = op_cost;
9163
9164 return true;
9165 }
9166 else if (MEM_P (op0))
9167 {
9168 /* All loads can zero extend to any size for free. */
9169 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9170 return true;
9171 }
9172
9173 op0 = aarch64_extend_bitfield_pattern_p (x);
9174 if (op0)
9175 {
9176 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9177 if (speed)
9178 *cost += extra_cost->alu.bfx;
9179 return true;
9180 }
9181
9182 if (speed)
9183 {
9184 if (VECTOR_MODE_P (mode))
9185 {
9186 /* UMOV. */
9187 *cost += extra_cost->vect.alu;
9188 }
9189 else
9190 {
9191 /* We generate an AND instead of UXTB/UXTH. */
9192 *cost += extra_cost->alu.logical;
9193 }
9194 }
9195 return false;
9196
9197 case SIGN_EXTEND:
9198 if (MEM_P (XEXP (x, 0)))
9199 {
9200 /* LDRSH. */
9201 if (speed)
9202 {
9203 rtx address = XEXP (XEXP (x, 0), 0);
9204 *cost += extra_cost->ldst.load_sign_extend;
9205
9206 *cost +=
9207 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9208 0, speed));
9209 }
9210 return true;
9211 }
9212
9213 op0 = aarch64_extend_bitfield_pattern_p (x);
9214 if (op0)
9215 {
9216 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9217 if (speed)
9218 *cost += extra_cost->alu.bfx;
9219 return true;
9220 }
9221
9222 if (speed)
9223 {
9224 if (VECTOR_MODE_P (mode))
9225 *cost += extra_cost->vect.alu;
9226 else
9227 *cost += extra_cost->alu.extend;
9228 }
9229 return false;
9230
9231 case ASHIFT:
9232 op0 = XEXP (x, 0);
9233 op1 = XEXP (x, 1);
9234
9235 if (CONST_INT_P (op1))
9236 {
9237 if (speed)
9238 {
9239 if (VECTOR_MODE_P (mode))
9240 {
9241 /* Vector shift (immediate). */
9242 *cost += extra_cost->vect.alu;
9243 }
9244 else
9245 {
9246 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9247 aliases. */
9248 *cost += extra_cost->alu.shift;
9249 }
9250 }
9251
9252 /* We can incorporate zero/sign extend for free. */
9253 if (GET_CODE (op0) == ZERO_EXTEND
9254 || GET_CODE (op0) == SIGN_EXTEND)
9255 op0 = XEXP (op0, 0);
9256
9257 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9258 return true;
9259 }
9260 else
9261 {
9262 if (VECTOR_MODE_P (mode))
9263 {
9264 if (speed)
9265 /* Vector shift (register). */
9266 *cost += extra_cost->vect.alu;
9267 }
9268 else
9269 {
9270 if (speed)
9271 /* LSLV. */
9272 *cost += extra_cost->alu.shift_reg;
9273
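/* Illustrative note (an ISA-level observation, not something this function
   checks beyond the pattern below): a shift by (x & (bitsize - 1)) maps
   directly onto the variable shift instruction, because the hardware already
   uses only the low bits of the shift amount, so the AND adds no cost.  */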
9274 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9275 && CONST_INT_P (XEXP (op1, 1))
9276 && known_eq (INTVAL (XEXP (op1, 1)),
9277 GET_MODE_BITSIZE (mode) - 1))
9278 {
9279 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9280 /* We already demanded XEXP (op1, 0) to be REG_P, so
9281 don't recurse into it. */
9282 return true;
9283 }
9284 }
9285 return false; /* All arguments need to be in registers. */
9286 }
9287
9288 case ROTATE:
9289 case ROTATERT:
9290 case LSHIFTRT:
9291 case ASHIFTRT:
9292 op0 = XEXP (x, 0);
9293 op1 = XEXP (x, 1);
9294
9295 if (CONST_INT_P (op1))
9296 {
9297 /* ASR (immediate) and friends. */
9298 if (speed)
9299 {
9300 if (VECTOR_MODE_P (mode))
9301 *cost += extra_cost->vect.alu;
9302 else
9303 *cost += extra_cost->alu.shift;
9304 }
9305
9306 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9307 return true;
9308 }
9309 else
9310 {
9311 if (VECTOR_MODE_P (mode))
9312 {
9313 if (speed)
9314 /* Vector shift (register). */
9315 *cost += extra_cost->vect.alu;
9316 }
9317 else
9318 {
9319 if (speed)
9320 /* ASR (register) and friends. */
9321 *cost += extra_cost->alu.shift_reg;
9322
9323 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9324 && CONST_INT_P (XEXP (op1, 1))
9325 && known_eq (INTVAL (XEXP (op1, 1)),
9326 GET_MODE_BITSIZE (mode) - 1))
9327 {
9328 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9329 /* We already demanded XEXP (op1, 0) to be REG_P, so
9330 don't recurse into it. */
9331 return true;
9332 }
9333 }
9334 return false; /* All arguments need to be in registers. */
9335 }
9336
9337 case SYMBOL_REF:
9338
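/* Illustrative note (typical code generation, not something this function
   emits): under the small code model a symbol address is usually
   materialized as

     adrp x0, sym
     add  x0, x0, :lo12:sym

   the tiny model can use a single "adr x0, sym", and the large model loads
   the full 64-bit address from a literal pool, which is why the code models
   below are costed differently.  */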
9339 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9340 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9341 {
9342 /* LDR. */
9343 if (speed)
9344 *cost += extra_cost->ldst.load;
9345 }
9346 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9347 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9348 {
9349 /* ADRP, followed by ADD. */
9350 *cost += COSTS_N_INSNS (1);
9351 if (speed)
9352 *cost += 2 * extra_cost->alu.arith;
9353 }
9354 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9355 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9356 {
9357 /* ADR. */
9358 if (speed)
9359 *cost += extra_cost->alu.arith;
9360 }
9361
9362 if (flag_pic)
9363 {
9364 /* One extra load instruction, after accessing the GOT. */
9365 *cost += COSTS_N_INSNS (1);
9366 if (speed)
9367 *cost += extra_cost->ldst.load;
9368 }
9369 return true;
9370
9371 case HIGH:
9372 case LO_SUM:
9373 /* ADRP/ADD (immediate). */
9374 if (speed)
9375 *cost += extra_cost->alu.arith;
9376 return true;
9377
9378 case ZERO_EXTRACT:
9379 case SIGN_EXTRACT:
9380 /* UBFX/SBFX. */
9381 if (speed)
9382 {
9383 if (VECTOR_MODE_P (mode))
9384 *cost += extra_cost->vect.alu;
9385 else
9386 *cost += extra_cost->alu.bfx;
9387 }
9388
9389 /* We can trust that the immediates used will be correct (there
9390 are no by-register forms), so we need only cost op0. */
9391 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9392 return true;
9393
9394 case MULT:
9395 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9396 /* aarch64_rtx_mult_cost always handles recursion to its
9397 operands. */
9398 return true;
9399
9400 case MOD:
9401 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9402 ANDs and a CSNEG. Assume here that a CSNEG has the same cost as
9403 an unconditional negate. This case should only ever be reached through
9404 the set_smod_pow2_cheap check in expmed.c. */
9405 if (CONST_INT_P (XEXP (x, 1))
9406 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9407 && (mode == SImode || mode == DImode))
9408 {
9409 /* We expand to 4 instructions. Reset the baseline. */
9410 *cost = COSTS_N_INSNS (4);
9411
9412 if (speed)
9413 *cost += 2 * extra_cost->alu.logical
9414 + 2 * extra_cost->alu.arith;
9415
9416 return true;
9417 }
9418
9419 /* Fall-through. */
9420 case UMOD:
9421 if (speed)
9422 {
9423 /* Slightly prefer UMOD over SMOD. */
9424 if (VECTOR_MODE_P (mode))
9425 *cost += extra_cost->vect.alu;
9426 else if (GET_MODE_CLASS (mode) == MODE_INT)
9427 *cost += (extra_cost->mult[mode == DImode].add
9428 + extra_cost->mult[mode == DImode].idiv
9429 + (code == MOD ? 1 : 0));
9430 }
9431 return false; /* All arguments need to be in registers. */
9432
9433 case DIV:
9434 case UDIV:
9435 case SQRT:
9436 if (speed)
9437 {
9438 if (VECTOR_MODE_P (mode))
9439 *cost += extra_cost->vect.alu;
9440 else if (GET_MODE_CLASS (mode) == MODE_INT)
9441 /* There is no integer SQRT, so only DIV and UDIV can get
9442 here. */
9443 *cost += (extra_cost->mult[mode == DImode].idiv
9444 /* Slightly prefer UDIV over SDIV. */
9445 + (code == DIV ? 1 : 0));
9446 else
9447 *cost += extra_cost->fp[mode == DFmode].div;
9448 }
9449 return false; /* All arguments need to be in registers. */
9450
9451 case IF_THEN_ELSE:
9452 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9453 XEXP (x, 2), cost, speed);
9454
9455 case EQ:
9456 case NE:
9457 case GT:
9458 case GTU:
9459 case LT:
9460 case LTU:
9461 case GE:
9462 case GEU:
9463 case LE:
9464 case LEU:
9465
9466 return false; /* All arguments must be in registers. */
9467
9468 case FMA:
9469 op0 = XEXP (x, 0);
9470 op1 = XEXP (x, 1);
9471 op2 = XEXP (x, 2);
9472
9473 if (speed)
9474 {
9475 if (VECTOR_MODE_P (mode))
9476 *cost += extra_cost->vect.alu;
9477 else
9478 *cost += extra_cost->fp[mode == DFmode].fma;
9479 }
9480
9481 /* FMSUB, FNMADD, and FNMSUB are free. */
9482 if (GET_CODE (op0) == NEG)
9483 op0 = XEXP (op0, 0);
9484
9485 if (GET_CODE (op2) == NEG)
9486 op2 = XEXP (op2, 0);
9487
9488 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9489 and the by-element operand as operand 0. */
9490 if (GET_CODE (op1) == NEG)
9491 op1 = XEXP (op1, 0);
9492
9493 /* Catch vector-by-element operations. The by-element operand can
9494 either be (vec_duplicate (vec_select (x))) or just
9495 (vec_select (x)), depending on whether we are multiplying by
9496 a vector or a scalar.
9497
9498 Canonicalization is not very good in these cases: FMA4 will put the
9499 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9500 if (GET_CODE (op0) == VEC_DUPLICATE)
9501 op0 = XEXP (op0, 0);
9502 else if (GET_CODE (op1) == VEC_DUPLICATE)
9503 op1 = XEXP (op1, 0);
9504
9505 if (GET_CODE (op0) == VEC_SELECT)
9506 op0 = XEXP (op0, 0);
9507 else if (GET_CODE (op1) == VEC_SELECT)
9508 op1 = XEXP (op1, 0);
9509
9510 /* If the remaining parameters are not registers,
9511 get the cost to put them into registers. */
9512 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9513 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9514 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9515 return true;
9516
9517 case FLOAT:
9518 case UNSIGNED_FLOAT:
9519 if (speed)
9520 *cost += extra_cost->fp[mode == DFmode].fromint;
9521 return false;
9522
9523 case FLOAT_EXTEND:
9524 if (speed)
9525 {
9526 if (VECTOR_MODE_P (mode))
9527 {
9528 /* Vector widening conversion. */
9529 *cost += extra_cost->vect.alu;
9530 }
9531 else
9532 *cost += extra_cost->fp[mode == DFmode].widen;
9533 }
9534 return false;
9535
9536 case FLOAT_TRUNCATE:
9537 if (speed)
9538 {
9539 if (VECTOR_MODE_P (mode))
9540 {
9541 /* Vector narrowing conversion. */
9542 *cost += extra_cost->vect.alu;
9543 }
9544 else
9545 *cost += extra_cost->fp[mode == DFmode].narrow;
9546 }
9547 return false;
9548
9549 case FIX:
9550 case UNSIGNED_FIX:
9551 x = XEXP (x, 0);
9552 /* Strip the rounding part. They will all be implemented
9553 by the fcvt* family of instructions anyway. */
9554 if (GET_CODE (x) == UNSPEC)
9555 {
9556 unsigned int uns_code = XINT (x, 1);
9557
9558 if (uns_code == UNSPEC_FRINTA
9559 || uns_code == UNSPEC_FRINTM
9560 || uns_code == UNSPEC_FRINTN
9561 || uns_code == UNSPEC_FRINTP
9562 || uns_code == UNSPEC_FRINTZ)
9563 x = XVECEXP (x, 0, 0);
9564 }
9565
9566 if (speed)
9567 {
9568 if (VECTOR_MODE_P (mode))
9569 *cost += extra_cost->vect.alu;
9570 else
9571 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9572 }
9573
9574 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9575 fixed-point fcvt. */
9576 if (GET_CODE (x) == MULT
9577 && ((VECTOR_MODE_P (mode)
9578 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9579 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9580 {
9581 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9582 0, speed);
9583 return true;
9584 }
9585
9586 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9587 return true;
9588
9589 case ABS:
9590 if (VECTOR_MODE_P (mode))
9591 {
9592 /* ABS (vector). */
9593 if (speed)
9594 *cost += extra_cost->vect.alu;
9595 }
9596 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9597 {
9598 op0 = XEXP (x, 0);
9599
9600 /* FABD, which is analogous to FADD. */
9601 if (GET_CODE (op0) == MINUS)
9602 {
9603 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9604 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9605 if (speed)
9606 *cost += extra_cost->fp[mode == DFmode].addsub;
9607
9608 return true;
9609 }
9610 /* Simple FABS is analogous to FNEG. */
9611 if (speed)
9612 *cost += extra_cost->fp[mode == DFmode].neg;
9613 }
9614 else
9615 {
9616 /* Integer ABS will either be split into
9617 two arithmetic instructions, or will be an ABS
9618 (scalar), which we don't model. */
9619 *cost = COSTS_N_INSNS (2);
9620 if (speed)
9621 *cost += 2 * extra_cost->alu.arith;
9622 }
9623 return false;
9624
9625 case SMAX:
9626 case SMIN:
9627 if (speed)
9628 {
9629 if (VECTOR_MODE_P (mode))
9630 *cost += extra_cost->vect.alu;
9631 else
9632 {
9633 /* FMAXNM/FMINNM/FMAX/FMIN.
9634 TODO: This may not be accurate for all implementations, but
9635 we do not model this in the cost tables. */
9636 *cost += extra_cost->fp[mode == DFmode].addsub;
9637 }
9638 }
9639 return false;
9640
9641 case UNSPEC:
9642 /* The floating point round to integer frint* instructions. */
9643 if (aarch64_frint_unspec_p (XINT (x, 1)))
9644 {
9645 if (speed)
9646 *cost += extra_cost->fp[mode == DFmode].roundint;
9647
9648 return false;
9649 }
9650
9651 if (XINT (x, 1) == UNSPEC_RBIT)
9652 {
9653 if (speed)
9654 *cost += extra_cost->alu.rev;
9655
9656 return false;
9657 }
9658 break;
9659
9660 case TRUNCATE:
9661
9662 /* Decompose <su>muldi3_highpart. */
9663 if (/* (truncate:DI */
9664 mode == DImode
9665 /* (lshiftrt:TI */
9666 && GET_MODE (XEXP (x, 0)) == TImode
9667 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9668 /* (mult:TI */
9669 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9670 /* (ANY_EXTEND:TI (reg:DI))
9671 (ANY_EXTEND:TI (reg:DI))) */
9672 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9673 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9674 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9675 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9676 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9677 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9678 /* (const_int 64) */
9679 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9680 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9681 {
9682 /* UMULH/SMULH. */
9683 if (speed)
9684 *cost += extra_cost->mult[mode == DImode].extend;
9685 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9686 mode, MULT, 0, speed);
9687 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9688 mode, MULT, 1, speed);
9689 return true;
9690 }
9691
9692 /* Fall through. */
9693 default:
9694 break;
9695 }
9696
9697 if (dump_file
9698 && flag_aarch64_verbose_cost)
9699 fprintf (dump_file,
9700 "\nFailed to cost RTX. Assuming default cost.\n");
9701
9702 return true;
9703 }
9704
9705 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
9706 calculated for X. This cost is stored in *COST. Returns true
9707 if the total cost of X was calculated. */
9708 static bool
9709 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9710 int param, int *cost, bool speed)
9711 {
9712 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9713
9714 if (dump_file
9715 && flag_aarch64_verbose_cost)
9716 {
9717 print_rtl_single (dump_file, x);
9718 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9719 speed ? "Hot" : "Cold",
9720 *cost, result ? "final" : "partial");
9721 }
9722
9723 return result;
9724 }
9725
9726 static int
9727 aarch64_register_move_cost (machine_mode mode,
9728 reg_class_t from_i, reg_class_t to_i)
9729 {
9730 enum reg_class from = (enum reg_class) from_i;
9731 enum reg_class to = (enum reg_class) to_i;
9732 const struct cpu_regmove_cost *regmove_cost
9733 = aarch64_tune_params.regmove_cost;
9734
9735 /* Tail-call address and pointer regs are equivalent to GENERAL_REGS. */
9736 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9737 to = GENERAL_REGS;
9738
9739 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9740 from = GENERAL_REGS;
9741
9742 /* Moving between a GPR and the stack costs the same as GP2GP. */
9743 if ((from == GENERAL_REGS && to == STACK_REG)
9744 || (to == GENERAL_REGS && from == STACK_REG))
9745 return regmove_cost->GP2GP;
9746
9747 /* To/From the stack register, we move via the gprs. */
9748 if (to == STACK_REG || from == STACK_REG)
9749 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9750 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9751
9752 if (known_eq (GET_MODE_SIZE (mode), 16))
9753 {
9754 /* 128-bit operations on general registers require 2 instructions. */
9755 if (from == GENERAL_REGS && to == GENERAL_REGS)
9756 return regmove_cost->GP2GP * 2;
9757 else if (from == GENERAL_REGS)
9758 return regmove_cost->GP2FP * 2;
9759 else if (to == GENERAL_REGS)
9760 return regmove_cost->FP2GP * 2;
9761
9762 /* When AdvSIMD instructions are disabled it is not possible to move
9763 a 128-bit value directly between Q registers. This is handled in
9764 secondary reload. A general register is used as a scratch to move
9765 the upper DI value and the lower DI value is moved directly,
9766 hence the cost is the sum of three moves. */
9767 if (! TARGET_SIMD)
9768 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9769
9770 return regmove_cost->FP2FP;
9771 }
9772
9773 if (from == GENERAL_REGS && to == GENERAL_REGS)
9774 return regmove_cost->GP2GP;
9775 else if (from == GENERAL_REGS)
9776 return regmove_cost->GP2FP;
9777 else if (to == GENERAL_REGS)
9778 return regmove_cost->FP2GP;
9779
9780 return regmove_cost->FP2FP;
9781 }
9782
9783 static int
9784 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9785 reg_class_t rclass ATTRIBUTE_UNUSED,
9786 bool in ATTRIBUTE_UNUSED)
9787 {
9788 return aarch64_tune_params.memmov_cost;
9789 }
9790
9791 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9792 to optimize 1.0/sqrt. */
9793
9794 static bool
9795 use_rsqrt_p (machine_mode mode)
9796 {
9797 return (!flag_trapping_math
9798 && flag_unsafe_math_optimizations
9799 && ((aarch64_tune_params.approx_modes->recip_sqrt
9800 & AARCH64_APPROX_MODE (mode))
9801 || flag_mrecip_low_precision_sqrt));
9802 }
9803
9804 /* Function to decide when to use the approximate reciprocal square root
9805 builtin. */
9806
9807 static tree
9808 aarch64_builtin_reciprocal (tree fndecl)
9809 {
9810 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9811
9812 if (!use_rsqrt_p (mode))
9813 return NULL_TREE;
9814 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9815 }
9816
9817 typedef rtx (*rsqrte_type) (rtx, rtx);
9818
9819 /* Select reciprocal square root initial estimate insn depending on machine
9820 mode. */
9821
9822 static rsqrte_type
9823 get_rsqrte_type (machine_mode mode)
9824 {
9825 switch (mode)
9826 {
9827 case E_DFmode: return gen_aarch64_rsqrtedf;
9828 case E_SFmode: return gen_aarch64_rsqrtesf;
9829 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9830 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9831 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9832 default: gcc_unreachable ();
9833 }
9834 }
9835
9836 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9837
9838 /* Select reciprocal square root series step insn depending on machine mode. */
9839
9840 static rsqrts_type
9841 get_rsqrts_type (machine_mode mode)
9842 {
9843 switch (mode)
9844 {
9845 case E_DFmode: return gen_aarch64_rsqrtsdf;
9846 case E_SFmode: return gen_aarch64_rsqrtssf;
9847 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9848 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9849 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9850 default: gcc_unreachable ();
9851 }
9852 }
9853
9854 /* Emit instruction sequence to compute either the approximate square root
9855 or its approximate reciprocal, depending on the flag RECP, and return
9856 whether the sequence was emitted or not. */
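/* A sketch of the method used below (an outline, not a precise description
   of the emitted RTL): starting from an FRSQRTE estimate x0 of 1/sqrt(d),
   each FRSQRTS step refines the estimate as

     x_{n+1} = x_n * (3 - d * x_n * x_n) / 2

   and, when the square root itself (rather than its reciprocal) is wanted,
   the result is finally formed as d * (1/sqrt(d)), with a mask forcing the
   answer to 0.0 when the input is 0.0.  */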
9857
9858 bool
9859 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9860 {
9861 machine_mode mode = GET_MODE (dst);
9862
9863 if (GET_MODE_INNER (mode) == HFmode)
9864 {
9865 gcc_assert (!recp);
9866 return false;
9867 }
9868
9869 if (!recp)
9870 {
9871 if (!(flag_mlow_precision_sqrt
9872 || (aarch64_tune_params.approx_modes->sqrt
9873 & AARCH64_APPROX_MODE (mode))))
9874 return false;
9875
9876 if (flag_finite_math_only
9877 || flag_trapping_math
9878 || !flag_unsafe_math_optimizations
9879 || optimize_function_for_size_p (cfun))
9880 return false;
9881 }
9882 else
9883 /* Caller assumes we cannot fail. */
9884 gcc_assert (use_rsqrt_p (mode));
9885
9886 machine_mode mmsk = mode_for_int_vector (mode).require ();
9887 rtx xmsk = gen_reg_rtx (mmsk);
9888 if (!recp)
9889 /* When calculating the approximate square root, compare the
9890 argument with 0.0 and create a mask. */
9891 emit_insn (gen_rtx_SET (xmsk,
9892 gen_rtx_NEG (mmsk,
9893 gen_rtx_EQ (mmsk, src,
9894 CONST0_RTX (mode)))));
9895
9896 /* Estimate the approximate reciprocal square root. */
9897 rtx xdst = gen_reg_rtx (mode);
9898 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9899
9900 /* Iterate over the series twice for SF and thrice for DF. */
9901 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9902
9903 /* Optionally iterate over the series once less for faster performance
9904 while sacrificing accuracy. */
9905 if ((recp && flag_mrecip_low_precision_sqrt)
9906 || (!recp && flag_mlow_precision_sqrt))
9907 iterations--;
9908
9909 /* Iterate over the series to calculate the approximate reciprocal square
9910 root. */
9911 rtx x1 = gen_reg_rtx (mode);
9912 while (iterations--)
9913 {
9914 rtx x2 = gen_reg_rtx (mode);
9915 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9916
9917 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9918
9919 if (iterations > 0)
9920 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9921 }
9922
9923 if (!recp)
9924 {
9925 /* Qualify the approximate reciprocal square root when the argument is
9926 0.0 by squashing the intermediate result to 0.0. */
9927 rtx xtmp = gen_reg_rtx (mmsk);
9928 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9929 gen_rtx_SUBREG (mmsk, xdst, 0)));
9930 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9931
9932 /* Calculate the approximate square root. */
9933 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9934 }
9935
9936 /* Finalize the approximation. */
9937 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9938
9939 return true;
9940 }
9941
9942 typedef rtx (*recpe_type) (rtx, rtx);
9943
9944 /* Select reciprocal initial estimate insn depending on machine mode. */
9945
9946 static recpe_type
9947 get_recpe_type (machine_mode mode)
9948 {
9949 switch (mode)
9950 {
9951 case E_SFmode: return (gen_aarch64_frecpesf);
9952 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9953 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9954 case E_DFmode: return (gen_aarch64_frecpedf);
9955 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9956 default: gcc_unreachable ();
9957 }
9958 }
9959
9960 typedef rtx (*recps_type) (rtx, rtx, rtx);
9961
9962 /* Select reciprocal series step insn depending on machine mode. */
9963
9964 static recps_type
9965 get_recps_type (machine_mode mode)
9966 {
9967 switch (mode)
9968 {
9969 case E_SFmode: return (gen_aarch64_frecpssf);
9970 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9971 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9972 case E_DFmode: return (gen_aarch64_frecpsdf);
9973 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9974 default: gcc_unreachable ();
9975 }
9976 }
9977
9978 /* Emit the instruction sequence to compute the approximation for the division
9979 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
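/* A sketch of the method used below (an outline, not a precise description
   of the emitted RTL): starting from an FRECPE estimate x0 of 1/DEN, each
   FRECPS step refines the estimate as

     x_{n+1} = x_n * (2 - DEN * x_n)

   and the quotient is then formed as NUM * (1/DEN), unless NUM is 1.0, in
   which case the refined reciprocal is used directly.  */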
9980
9981 bool
9982 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9983 {
9984 machine_mode mode = GET_MODE (quo);
9985
9986 if (GET_MODE_INNER (mode) == HFmode)
9987 return false;
9988
9989 bool use_approx_division_p = (flag_mlow_precision_div
9990 || (aarch64_tune_params.approx_modes->division
9991 & AARCH64_APPROX_MODE (mode)));
9992
9993 if (!flag_finite_math_only
9994 || flag_trapping_math
9995 || !flag_unsafe_math_optimizations
9996 || optimize_function_for_size_p (cfun)
9997 || !use_approx_division_p)
9998 return false;
9999
10000 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10001 return false;
10002
10003 /* Estimate the approximate reciprocal. */
10004 rtx xrcp = gen_reg_rtx (mode);
10005 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10006
10007 /* Iterate over the series twice for SF and thrice for DF. */
10008 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10009
10010 /* Optionally iterate over the series once less for faster performance,
10011 while sacrificing accuracy. */
10012 if (flag_mlow_precision_div)
10013 iterations--;
10014
10015 /* Iterate over the series to calculate the approximate reciprocal. */
10016 rtx xtmp = gen_reg_rtx (mode);
10017 while (iterations--)
10018 {
10019 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10020
10021 if (iterations > 0)
10022 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10023 }
10024
10025 if (num != CONST1_RTX (mode))
10026 {
10027 /* As the approximate reciprocal of DEN is already calculated, only
10028 calculate the approximate division when NUM is not 1.0. */
10029 rtx xnum = force_reg (mode, num);
10030 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10031 }
10032
10033 /* Finalize the approximation. */
10034 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10035 return true;
10036 }
10037
10038 /* Return the number of instructions that can be issued per cycle. */
10039 static int
10040 aarch64_sched_issue_rate (void)
10041 {
10042 return aarch64_tune_params.issue_rate;
10043 }
10044
10045 static int
10046 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10047 {
10048 int issue_rate = aarch64_sched_issue_rate ();
10049
10050 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10051 }
10052
10053
10054 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10055 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10056 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10057
10058 static int
10059 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10060 int ready_index)
10061 {
10062 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10063 }
10064
10065
10066 /* Vectorizer cost model target hooks. */
10067
10068 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10069 static int
10070 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10071 tree vectype,
10072 int misalign ATTRIBUTE_UNUSED)
10073 {
10074 unsigned elements;
10075 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10076 bool fp = false;
10077
10078 if (vectype != NULL)
10079 fp = FLOAT_TYPE_P (vectype);
10080
10081 switch (type_of_cost)
10082 {
10083 case scalar_stmt:
10084 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10085
10086 case scalar_load:
10087 return costs->scalar_load_cost;
10088
10089 case scalar_store:
10090 return costs->scalar_store_cost;
10091
10092 case vector_stmt:
10093 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10094
10095 case vector_load:
10096 return costs->vec_align_load_cost;
10097
10098 case vector_store:
10099 return costs->vec_store_cost;
10100
10101 case vec_to_scalar:
10102 return costs->vec_to_scalar_cost;
10103
10104 case scalar_to_vec:
10105 return costs->scalar_to_vec_cost;
10106
10107 case unaligned_load:
10108 case vector_gather_load:
10109 return costs->vec_unalign_load_cost;
10110
10111 case unaligned_store:
10112 case vector_scatter_store:
10113 return costs->vec_unalign_store_cost;
10114
10115 case cond_branch_taken:
10116 return costs->cond_taken_branch_cost;
10117
10118 case cond_branch_not_taken:
10119 return costs->cond_not_taken_branch_cost;
10120
10121 case vec_perm:
10122 return costs->vec_permute_cost;
10123
10124 case vec_promote_demote:
10125 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10126
10127 case vec_construct:
10128 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10129 return elements / 2 + 1;
10130
10131 default:
10132 gcc_unreachable ();
10133 }
10134 }
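/* A worked example of the vec_construct case above (illustrative only): for
   a V4SF vector TYPE_VECTOR_SUBPARTS is 4, so the cost returned is
   4 / 2 + 1 = 3 units, roughly one operation per pair of elements plus the
   initial move.  */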
10135
10136 /* Implement targetm.vectorize.add_stmt_cost. */
10137 static unsigned
10138 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10139 struct _stmt_vec_info *stmt_info, int misalign,
10140 enum vect_cost_model_location where)
10141 {
10142 unsigned *cost = (unsigned *) data;
10143 unsigned retval = 0;
10144
10145 if (flag_vect_cost_model)
10146 {
10147 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10148 int stmt_cost =
10149 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10150
10151 /* Statements in an inner loop relative to the loop being
10152 vectorized are weighted more heavily. The value here is
10153 arbitrary and could potentially be improved with analysis. */
10154 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10155 count *= 50; /* FIXME */
10156
10157 retval = (unsigned) (count * stmt_cost);
10158 cost[where] += retval;
10159 }
10160
10161 return retval;
10162 }
10163
10164 static void initialize_aarch64_code_model (struct gcc_options *);
10165
10166 /* Parse the TO_PARSE string and put the architecture struct that it
10167 selects into RES and the architectural features into ISA_FLAGS.
10168 Return an aarch64_parse_opt_result describing the parse result.
10169 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
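/* For example (an illustrative string): parsing "armv8.2-a+fp16" splits at
   the first '+', matches "armv8.2-a" against all_architectures, and then
   hands the "+fp16" suffix to aarch64_parse_extension to adjust the ISA
   flags.  */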
10170
10171 static enum aarch64_parse_opt_result
10172 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10173 unsigned long *isa_flags)
10174 {
10175 char *ext;
10176 const struct processor *arch;
10177 char *str = (char *) alloca (strlen (to_parse) + 1);
10178 size_t len;
10179
10180 strcpy (str, to_parse);
10181
10182 ext = strchr (str, '+');
10183
10184 if (ext != NULL)
10185 len = ext - str;
10186 else
10187 len = strlen (str);
10188
10189 if (len == 0)
10190 return AARCH64_PARSE_MISSING_ARG;
10191
10192
10193 /* Loop through the list of supported ARCHes to find a match. */
10194 for (arch = all_architectures; arch->name != NULL; arch++)
10195 {
10196 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10197 {
10198 unsigned long isa_temp = arch->flags;
10199
10200 if (ext != NULL)
10201 {
10202 /* TO_PARSE string contains at least one extension. */
10203 enum aarch64_parse_opt_result ext_res
10204 = aarch64_parse_extension (ext, &isa_temp);
10205
10206 if (ext_res != AARCH64_PARSE_OK)
10207 return ext_res;
10208 }
10209 /* Extension parsing was successful. Record the resulting
10210 arch and ISA flags. */
10211 *res = arch;
10212 *isa_flags = isa_temp;
10213 return AARCH64_PARSE_OK;
10214 }
10215 }
10216
10217 /* ARCH name not found in list. */
10218 return AARCH64_PARSE_INVALID_ARG;
10219 }
10220
10221 /* Parse the TO_PARSE string and put the cpu it selects into RES and the
10222 architectural feature flags into ISA_FLAGS. Return an aarch64_parse_opt_result
10223 describing the parse result. If there is an error parsing, RES and
10224 ISA_FLAGS are left unchanged. */
10225
10226 static enum aarch64_parse_opt_result
10227 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10228 unsigned long *isa_flags)
10229 {
10230 char *ext;
10231 const struct processor *cpu;
10232 char *str = (char *) alloca (strlen (to_parse) + 1);
10233 size_t len;
10234
10235 strcpy (str, to_parse);
10236
10237 ext = strchr (str, '+');
10238
10239 if (ext != NULL)
10240 len = ext - str;
10241 else
10242 len = strlen (str);
10243
10244 if (len == 0)
10245 return AARCH64_PARSE_MISSING_ARG;
10246
10247
10248 /* Loop through the list of supported CPUs to find a match. */
10249 for (cpu = all_cores; cpu->name != NULL; cpu++)
10250 {
10251 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10252 {
10253 unsigned long isa_temp = cpu->flags;
10254
10255
10256 if (ext != NULL)
10257 {
10258 /* TO_PARSE string contains at least one extension. */
10259 enum aarch64_parse_opt_result ext_res
10260 = aarch64_parse_extension (ext, &isa_temp);
10261
10262 if (ext_res != AARCH64_PARSE_OK)
10263 return ext_res;
10264 }
10265 /* Extension parsing was successful. Record the resulting
10266 cpu and ISA flags. */
10267 *res = cpu;
10268 *isa_flags = isa_temp;
10269 return AARCH64_PARSE_OK;
10270 }
10271 }
10272
10273 /* CPU name not found in list. */
10274 return AARCH64_PARSE_INVALID_ARG;
10275 }
10276
10277 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10278 Return an aarch64_parse_opt_result describing the parse result.
10279 If the parsing fails, RES does not change. */
10280
10281 static enum aarch64_parse_opt_result
10282 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10283 {
10284 const struct processor *cpu;
10285 char *str = (char *) alloca (strlen (to_parse) + 1);
10286
10287 strcpy (str, to_parse);
10288
10289 /* Loop through the list of supported CPUs to find a match. */
10290 for (cpu = all_cores; cpu->name != NULL; cpu++)
10291 {
10292 if (strcmp (cpu->name, str) == 0)
10293 {
10294 *res = cpu;
10295 return AARCH64_PARSE_OK;
10296 }
10297 }
10298
10299 /* CPU name not found in list. */
10300 return AARCH64_PARSE_INVALID_ARG;
10301 }
10302
10303 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10304 described in FLAG. If it is, return the index bit for that fusion type.
10305 If not, error (printing OPTION_NAME) and return zero. */
10306
10307 static unsigned int
10308 aarch64_parse_one_option_token (const char *token,
10309 size_t length,
10310 const struct aarch64_flag_desc *flag,
10311 const char *option_name)
10312 {
10313 for (; flag->name != NULL; flag++)
10314 {
10315 if (length == strlen (flag->name)
10316 && !strncmp (flag->name, token, length))
10317 return flag->flag;
10318 }
10319
10320 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10321 return 0;
10322 }
10323
10324 /* Parse OPTION which is a comma-separated list of flags to enable.
10325 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10326 default state we inherit from the CPU tuning structures. OPTION_NAME
10327 gives the top-level option we are parsing in the -moverride string,
10328 for use in error messages. */
10329
10330 static unsigned int
10331 aarch64_parse_boolean_options (const char *option,
10332 const struct aarch64_flag_desc *flags,
10333 unsigned int initial_state,
10334 const char *option_name)
10335 {
10336 const char separator = '.';
10337 const char* specs = option;
10338 const char* ntoken = option;
10339 unsigned int found_flags = initial_state;
10340
10341 while ((ntoken = strchr (specs, separator)))
10342 {
10343 size_t token_length = ntoken - specs;
10344 unsigned token_ops = aarch64_parse_one_option_token (specs,
10345 token_length,
10346 flags,
10347 option_name);
10348 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10349 in the token stream, reset the supported operations. So:
10350
10351 adrp+add.cmp+branch.none.adrp+add
10352
10353 would have the result of turning on only adrp+add fusion. */
10354 if (!token_ops)
10355 found_flags = 0;
10356
10357 found_flags |= token_ops;
10358 specs = ++ntoken;
10359 }
10360
10361 /* The string ended with a trailing separator: report an error. */
10362 if (!(*specs))
10363 {
10364 error ("%s string ill-formed\n", option_name);
10365 return 0;
10366 }
10367
10368 /* We still have one more token to parse. */
10369 size_t token_length = strlen (specs);
10370 unsigned token_ops = aarch64_parse_one_option_token (specs,
10371 token_length,
10372 flags,
10373 option_name);
10374 if (!token_ops)
10375 found_flags = 0;
10376
10377 found_flags |= token_ops;
10378 return found_flags;
10379 }
10380
10381 /* Support for overriding instruction fusion. */
10382
10383 static void
10384 aarch64_parse_fuse_string (const char *fuse_string,
10385 struct tune_params *tune)
10386 {
10387 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10388 aarch64_fusible_pairs,
10389 tune->fusible_ops,
10390 "fuse=");
10391 }
10392
10393 /* Support for overriding other tuning flags. */
10394
10395 static void
10396 aarch64_parse_tune_string (const char *tune_string,
10397 struct tune_params *tune)
10398 {
10399 tune->extra_tuning_flags
10400 = aarch64_parse_boolean_options (tune_string,
10401 aarch64_tuning_flags,
10402 tune->extra_tuning_flags,
10403 "tune=");
10404 }
10405
10406 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10407 we understand. If it is, extract the option string and hand off to
10408 the appropriate function. */
10409
10410 void
10411 aarch64_parse_one_override_token (const char* token,
10412 size_t length,
10413 struct tune_params *tune)
10414 {
10415 const struct aarch64_tuning_override_function *fn
10416 = aarch64_tuning_override_functions;
10417
10418 const char *option_part = strchr (token, '=');
10419 if (!option_part)
10420 {
10421 error ("tuning string missing in option (%s)", token);
10422 return;
10423 }
10424
10425 /* Get the length of the option name. */
10426 length = option_part - token;
10427 /* Skip the '=' to get to the option string. */
10428 option_part++;
10429
10430 for (; fn->name != NULL; fn++)
10431 {
10432 if (!strncmp (fn->name, token, length))
10433 {
10434 fn->parse_override (option_part, tune);
10435 return;
10436 }
10437 }
10438
10439 error ("unknown tuning option (%s)", token);
10440 return;
10441 }
10442
10443 /* Validate and clamp the TLS size according to the code model in OPTS. */
10444
10445 static void
10446 initialize_aarch64_tls_size (struct gcc_options *opts)
10447 {
10448 if (aarch64_tls_size == 0)
10449 aarch64_tls_size = 24;
10450
10451 switch (opts->x_aarch64_cmodel_var)
10452 {
10453 case AARCH64_CMODEL_TINY:
10454 /* Both the default and the maximum TLS size allowed under tiny are 1M,
10455 which needs two instructions to address, so we clamp the size to 24. */
10456 if (aarch64_tls_size > 24)
10457 aarch64_tls_size = 24;
10458 break;
10459 case AARCH64_CMODEL_SMALL:
10460 /* The maximum TLS size allowed under small is 4G. */
10461 if (aarch64_tls_size > 32)
10462 aarch64_tls_size = 32;
10463 break;
10464 case AARCH64_CMODEL_LARGE:
10465 /* The maximum TLS size allowed under large is 16E.
10466 FIXME: 16E should be 64-bit; we only support a 48-bit offset for now. */
10467 if (aarch64_tls_size > 48)
10468 aarch64_tls_size = 48;
10469 break;
10470 default:
10471 gcc_unreachable ();
10472 }
10473
10474 return;
10475 }
10476
10477 /* Parse STRING looking for options in the format:
10478 string :: option:string
10479 option :: name=substring
10480 name :: {a-z}
10481 substring :: defined by option. */
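/* For example (an illustrative -moverride string, assuming the "fuse"
   override is registered in aarch64_tuning_override_functions):

     -moverride=fuse=adrp+add.cmp+branch

   is split at ':' into a single "fuse=..." option, whose value is in turn a
   '.'-separated list handled by aarch64_parse_boolean_options.  */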
10482
10483 static void
10484 aarch64_parse_override_string (const char* input_string,
10485 struct tune_params* tune)
10486 {
10487 const char separator = ':';
10488 size_t string_length = strlen (input_string) + 1;
10489 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10490 char *string = string_root;
10491 strncpy (string, input_string, string_length);
10492 string[string_length - 1] = '\0';
10493
10494 char* ntoken = string;
10495
10496 while ((ntoken = strchr (string, separator)))
10497 {
10498 size_t token_length = ntoken - string;
10499 /* Make this substring look like a string. */
10500 *ntoken = '\0';
10501 aarch64_parse_one_override_token (string, token_length, tune);
10502 string = ++ntoken;
10503 }
10504
10505 /* One last option to parse. */
10506 aarch64_parse_one_override_token (string, strlen (string), tune);
10507 free (string_root);
10508 }
10509
10510
10511 static void
10512 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10513 {
10514 /* PR 70044: We have to be careful about being called multiple times for the
10515 same function. This means all changes should be repeatable. */
10516
10517 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10518 Disable the frame pointer flag so the mid-end will not use a frame
10519 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10520 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10521 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
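/* Concretely (as implied by the code below): with -fomit-frame-pointer the
   flag is 1, so aarch64_use_frame_pointer becomes false and the flag is left
   alone; with -fno-omit-frame-pointer the flag is 0, so
   aarch64_use_frame_pointer becomes true and the flag is rewritten to 2; a
   flag value of 2 on a later call keeps aarch64_use_frame_pointer true.  */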
10522 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10523 if (opts->x_flag_omit_frame_pointer == 0)
10524 opts->x_flag_omit_frame_pointer = 2;
10525
10526 /* If not optimizing for size, set the default
10527 alignment to what the target wants. */
10528 if (!opts->x_optimize_size)
10529 {
10530 if (opts->x_align_loops <= 0)
10531 opts->x_align_loops = aarch64_tune_params.loop_align;
10532 if (opts->x_align_jumps <= 0)
10533 opts->x_align_jumps = aarch64_tune_params.jump_align;
10534 if (opts->x_align_functions <= 0)
10535 opts->x_align_functions = aarch64_tune_params.function_align;
10536 }
10537
10538 /* We default to no pc-relative literal loads. */
10539
10540 aarch64_pcrelative_literal_loads = false;
10541
10542 /* If -mpc-relative-literal-loads is set on the command line, this
10543 implies that the user asked for PC relative literal loads. */
10544 if (opts->x_pcrelative_literal_loads == 1)
10545 aarch64_pcrelative_literal_loads = true;
10546
10547 /* In the tiny memory model it makes no sense to disallow PC relative
10548 literal pool loads. */
10549 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10550 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10551 aarch64_pcrelative_literal_loads = true;
10552
10553 /* When enabling the lower precision Newton series for the square root, also
10554 enable it for the reciprocal square root, since the latter is an
10555 intermediate step for the former. */
10556 if (flag_mlow_precision_sqrt)
10557 flag_mrecip_low_precision_sqrt = true;
10558 }
10559
10560 /* 'Unpack' the internal tuning structs and update the options
10561 in OPTS. The caller must have set up selected_tune and selected_arch
10562 as all the other target-specific codegen decisions are
10563 derived from them. */
10564
10565 void
10566 aarch64_override_options_internal (struct gcc_options *opts)
10567 {
10568 aarch64_tune_flags = selected_tune->flags;
10569 aarch64_tune = selected_tune->sched_core;
10570 /* Make a copy of the tuning parameters attached to the core, which
10571 we may later overwrite. */
10572 aarch64_tune_params = *(selected_tune->tune);
10573 aarch64_architecture_version = selected_arch->architecture_version;
10574
10575 if (opts->x_aarch64_override_tune_string)
10576 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10577 &aarch64_tune_params);
10578
10579 /* This target defaults to strict volatile bitfields. */
10580 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10581 opts->x_flag_strict_volatile_bitfields = 1;
10582
10583 initialize_aarch64_code_model (opts);
10584 initialize_aarch64_tls_size (opts);
10585
10586 int queue_depth = 0;
10587 switch (aarch64_tune_params.autoprefetcher_model)
10588 {
10589 case tune_params::AUTOPREFETCHER_OFF:
10590 queue_depth = -1;
10591 break;
10592 case tune_params::AUTOPREFETCHER_WEAK:
10593 queue_depth = 0;
10594 break;
10595 case tune_params::AUTOPREFETCHER_STRONG:
10596 queue_depth = max_insn_queue_index + 1;
10597 break;
10598 default:
10599 gcc_unreachable ();
10600 }
10601
10602 /* We don't mind passing in global_options_set here as we don't use
10603 the *options_set structs anyway. */
10604 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10605 queue_depth,
10606 opts->x_param_values,
10607 global_options_set.x_param_values);
10608
10609 /* Set up parameters to be used in the prefetching algorithm. Do not
10610 override the defaults unless we are tuning for a core we have
10611 researched values for. */
10612 if (aarch64_tune_params.prefetch->num_slots > 0)
10613 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10614 aarch64_tune_params.prefetch->num_slots,
10615 opts->x_param_values,
10616 global_options_set.x_param_values);
10617 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10618 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10619 aarch64_tune_params.prefetch->l1_cache_size,
10620 opts->x_param_values,
10621 global_options_set.x_param_values);
10622 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10623 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10624 aarch64_tune_params.prefetch->l1_cache_line_size,
10625 opts->x_param_values,
10626 global_options_set.x_param_values);
10627 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10628 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10629 aarch64_tune_params.prefetch->l2_cache_size,
10630 opts->x_param_values,
10631 global_options_set.x_param_values);
10632
10633 /* Use the alternative scheduling-pressure algorithm by default. */
10634 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10635 opts->x_param_values,
10636 global_options_set.x_param_values);
10637
10638 /* Enable software prefetching at the specified optimization level for
10639 CPUs that have a prefetch model. Lower the optimization level threshold by 1
10640 when profiling is enabled. */
10641 if (opts->x_flag_prefetch_loop_arrays < 0
10642 && !opts->x_optimize_size
10643 && aarch64_tune_params.prefetch->default_opt_level >= 0
10644 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10645 opts->x_flag_prefetch_loop_arrays = 1;
10646
10647 aarch64_override_options_after_change_1 (opts);
10648 }
10649
10650 /* Print a hint with a suggestion for a core or architecture name that
10651 most closely resembles what the user passed in STR. ARCH is true if
10652 the user is asking for an architecture name. ARCH is false if the user
10653 is asking for a core name. */
10654
10655 static void
10656 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10657 {
10658 auto_vec<const char *> candidates;
10659 const struct processor *entry = arch ? all_architectures : all_cores;
10660 for (; entry->name != NULL; entry++)
10661 candidates.safe_push (entry->name);
10662
10663 #ifdef HAVE_LOCAL_CPU_DETECT
10664 /* Also add "native" as a possible value. */
10665 if (arch)
10666 candidates.safe_push ("native");
10667 #endif
10668
10669 char *s;
10670 const char *hint = candidates_list_and_hint (str, s, candidates);
10671 if (hint)
10672 inform (input_location, "valid arguments are: %s;"
10673 " did you mean %qs?", s, hint);
10674 else
10675 inform (input_location, "valid arguments are: %s", s);
10676
10677 XDELETEVEC (s);
10678 }
10679
10680 /* Print a hint with a suggestion for a core name that most closely resembles
10681 what the user passed in STR. */
10682
10683 inline static void
10684 aarch64_print_hint_for_core (const char *str)
10685 {
10686 aarch64_print_hint_for_core_or_arch (str, false);
10687 }
10688
10689 /* Print a hint with a suggestion for an architecture name that most closely
10690 resembles what the user passed in STR. */
10691
10692 inline static void
10693 aarch64_print_hint_for_arch (const char *str)
10694 {
10695 aarch64_print_hint_for_core_or_arch (str, true);
10696 }
10697
10698 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10699 specified in STR and throw errors if appropriate. Put the results,
10700 if they are valid, in RES and ISA_FLAGS. Return whether the option is
10701 valid. */
10702
10703 static bool
10704 aarch64_validate_mcpu (const char *str, const struct processor **res,
10705 unsigned long *isa_flags)
10706 {
10707 enum aarch64_parse_opt_result parse_res
10708 = aarch64_parse_cpu (str, res, isa_flags);
10709
10710 if (parse_res == AARCH64_PARSE_OK)
10711 return true;
10712
10713 switch (parse_res)
10714 {
10715 case AARCH64_PARSE_MISSING_ARG:
10716 error ("missing cpu name in %<-mcpu=%s%>", str);
10717 break;
10718 case AARCH64_PARSE_INVALID_ARG:
10719 error ("unknown value %qs for -mcpu", str);
10720 aarch64_print_hint_for_core (str);
10721 break;
10722 case AARCH64_PARSE_INVALID_FEATURE:
10723 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10724 break;
10725 default:
10726 gcc_unreachable ();
10727 }
10728
10729 return false;
10730 }
10731
10732 /* Validate a command-line -march option. Parse the arch and extensions
10733 (if any) specified in STR and throw errors if appropriate. Put the
10734 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10735 option is valid. */
10736
10737 static bool
10738 aarch64_validate_march (const char *str, const struct processor **res,
10739 unsigned long *isa_flags)
10740 {
10741 enum aarch64_parse_opt_result parse_res
10742 = aarch64_parse_arch (str, res, isa_flags);
10743
10744 if (parse_res == AARCH64_PARSE_OK)
10745 return true;
10746
10747 switch (parse_res)
10748 {
10749 case AARCH64_PARSE_MISSING_ARG:
10750 error ("missing arch name in %<-march=%s%>", str);
10751 break;
10752 case AARCH64_PARSE_INVALID_ARG:
10753 error ("unknown value %qs for -march", str);
10754 aarch64_print_hint_for_arch (str);
10755 break;
10756 case AARCH64_PARSE_INVALID_FEATURE:
10757 error ("invalid feature modifier in %<-march=%s%>", str);
10758 break;
10759 default:
10760 gcc_unreachable ();
10761 }
10762
10763 return false;
10764 }
10765
10766 /* Validate a command-line -mtune option. Parse the cpu
10767 specified in STR and throw errors if appropriate. Put the
10768 result, if it is valid, in RES. Return whether the option is
10769 valid. */
10770
10771 static bool
10772 aarch64_validate_mtune (const char *str, const struct processor **res)
10773 {
10774 enum aarch64_parse_opt_result parse_res
10775 = aarch64_parse_tune (str, res);
10776
10777 if (parse_res == AARCH64_PARSE_OK)
10778 return true;
10779
10780 switch (parse_res)
10781 {
10782 case AARCH64_PARSE_MISSING_ARG:
10783 error ("missing cpu name in %<-mtune=%s%>", str);
10784 break;
10785 case AARCH64_PARSE_INVALID_ARG:
10786 error ("unknown value %qs for -mtune", str);
10787 aarch64_print_hint_for_core (str);
10788 break;
10789 default:
10790 gcc_unreachable ();
10791 }
10792 return false;
10793 }
10794
10795 /* Return the CPU corresponding to the enum CPU.
10796 If it doesn't specify a cpu, return the default. */
10797
10798 static const struct processor *
10799 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10800 {
10801 if (cpu != aarch64_none)
10802 return &all_cores[cpu];
10803
10804 /* The & 0x3f is to extract the bottom 6 bits that encode the
10805 default cpu as selected by the --with-cpu GCC configure option
10806 in config.gcc.
10807 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10808 flags mechanism should be reworked to make it more sane. */
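  /* Presumably (as the ">> 6" in aarch64_override_options suggests), the
     remaining upper bits of TARGET_CPU_DEFAULT carry the default ISA flag
     set (AARCH64_CPU_DEFAULT_FLAGS) for that CPU.  */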
10809 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10810 }
10811
10812 /* Return the architecture corresponding to the enum ARCH.
10813 If it doesn't specify a valid architecture, return the default. */
10814
10815 static const struct processor *
10816 aarch64_get_arch (enum aarch64_arch arch)
10817 {
10818 if (arch != aarch64_no_arch)
10819 return &all_architectures[arch];
10820
10821 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10822
10823 return &all_architectures[cpu->arch];
10824 }
10825
10826 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10827
10828 static poly_uint16
10829 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10830 {
10831 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10832 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10833 deciding which .md file patterns to use and when deciding whether
10834 something is a legitimate address or constant. */
10835 if (value == SVE_SCALABLE || value == SVE_128)
10836 return poly_uint16 (2, 2);
10837 else
10838 return (int) value / 64;
10839 }
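/* Worked example (assuming the enumerators other than SVE_SCALABLE equal
   their bit widths, as the division above suggests): -msve-vector-bits=256
   yields 256 / 64 = 4, i.e. four 64-bit granules per SVE vector, whereas
   both "scalable" and 128 produce the poly_uint16 (2, 2) above.  */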
10840
10841 /* Implement TARGET_OPTION_OVERRIDE. This is called once at the beginning
10842 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
10843 tuning structs. In particular it must set selected_tune and
10844 aarch64_isa_flags that define the available ISA features and tuning
10845 decisions. It must also set selected_arch as this will be used to
10846 output the .arch asm tags for each function. */
10847
10848 static void
10849 aarch64_override_options (void)
10850 {
10851 unsigned long cpu_isa = 0;
10852 unsigned long arch_isa = 0;
10853 aarch64_isa_flags = 0;
10854
10855 bool valid_cpu = true;
10856 bool valid_tune = true;
10857 bool valid_arch = true;
10858
10859 selected_cpu = NULL;
10860 selected_arch = NULL;
10861 selected_tune = NULL;
10862
10863 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10864 If either of -march or -mtune is given, they override their
10865 respective component of -mcpu. */
10866 if (aarch64_cpu_string)
10867 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10868 &cpu_isa);
10869
10870 if (aarch64_arch_string)
10871 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10872 &arch_isa);
10873
10874 if (aarch64_tune_string)
10875 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10876
10877 /* If the user did not specify a processor, choose the default
10878 one for them. This will be the CPU set during configuration using
10879 --with-cpu, otherwise it is "generic". */
10880 if (!selected_cpu)
10881 {
10882 if (selected_arch)
10883 {
10884 selected_cpu = &all_cores[selected_arch->ident];
10885 aarch64_isa_flags = arch_isa;
10886 explicit_arch = selected_arch->arch;
10887 }
10888 else
10889 {
10890 /* Get default configure-time CPU. */
10891 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10892 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10893 }
10894
10895 if (selected_tune)
10896 explicit_tune_core = selected_tune->ident;
10897 }
10898 /* If both -mcpu and -march are specified, check that they are architecturally
10899 compatible, warn if they're not and prefer the -march ISA flags. */
10900 else if (selected_arch)
10901 {
10902 if (selected_arch->arch != selected_cpu->arch)
10903 {
10904 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10905 all_architectures[selected_cpu->arch].name,
10906 selected_arch->name);
10907 }
10908 aarch64_isa_flags = arch_isa;
10909 explicit_arch = selected_arch->arch;
10910 explicit_tune_core = selected_tune ? selected_tune->ident
10911 : selected_cpu->ident;
10912 }
10913 else
10914 {
10915 /* -mcpu but no -march. */
10916 aarch64_isa_flags = cpu_isa;
10917 explicit_tune_core = selected_tune ? selected_tune->ident
10918 : selected_cpu->ident;
10919 gcc_assert (selected_cpu);
10920 selected_arch = &all_architectures[selected_cpu->arch];
10921 explicit_arch = selected_arch->arch;
10922 }
10923
10924 /* Set the arch as well, as we will need it when outputting
10925 the .arch directive in assembly. */
10926 if (!selected_arch)
10927 {
10928 gcc_assert (selected_cpu);
10929 selected_arch = &all_architectures[selected_cpu->arch];
10930 }
10931
10932 if (!selected_tune)
10933 selected_tune = selected_cpu;
10934
10935 #ifndef HAVE_AS_MABI_OPTION
10936 /* The compiler may have been configured with 2.23.* binutils, which does
10937 not have support for ILP32. */
10938 if (TARGET_ILP32)
10939 error ("assembler does not support -mabi=ilp32");
10940 #endif
10941
10942 /* Convert -msve-vector-bits to a VG count. */
10943 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10944
10945 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10946 sorry ("return address signing is only supported for -mabi=lp64");
10947
10948 /* Make sure we properly set up the explicit options. */
10949 if ((aarch64_cpu_string && valid_cpu)
10950 || (aarch64_tune_string && valid_tune))
10951 gcc_assert (explicit_tune_core != aarch64_none);
10952
10953 if ((aarch64_cpu_string && valid_cpu)
10954 || (aarch64_arch_string && valid_arch))
10955 gcc_assert (explicit_arch != aarch64_no_arch);
10956
10957 aarch64_override_options_internal (&global_options);
10958
10959 /* Save these options as the default ones in case we push and pop them later
10960 while processing functions with potential target attributes. */
10961 target_option_default_node = target_option_current_node
10962 = build_target_option_node (&global_options);
10963 }
10964
10965 /* Implement targetm.override_options_after_change. */
10966
10967 static void
10968 aarch64_override_options_after_change (void)
10969 {
10970 aarch64_override_options_after_change_1 (&global_options);
10971 }
10972
10973 static struct machine_function *
10974 aarch64_init_machine_status (void)
10975 {
10976 struct machine_function *machine;
10977 machine = ggc_cleared_alloc<machine_function> ();
10978 return machine;
10979 }
10980
10981 void
10982 aarch64_init_expanders (void)
10983 {
10984 init_machine_status = aarch64_init_machine_status;
10985 }
10986
10987 /* Select the code model to use based on OPTS, including -fpic/-fPIC. */
10988 static void
10989 initialize_aarch64_code_model (struct gcc_options *opts)
10990 {
10991 if (opts->x_flag_pic)
10992 {
10993 switch (opts->x_aarch64_cmodel_var)
10994 {
10995 case AARCH64_CMODEL_TINY:
10996 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10997 break;
10998 case AARCH64_CMODEL_SMALL:
10999 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11000 aarch64_cmodel = (flag_pic == 2
11001 ? AARCH64_CMODEL_SMALL_PIC
11002 : AARCH64_CMODEL_SMALL_SPIC);
11003 #else
11004 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11005 #endif
11006 break;
11007 case AARCH64_CMODEL_LARGE:
11008 sorry ("code model %qs with -f%s", "large",
11009 opts->x_flag_pic > 1 ? "PIC" : "pic");
11010 break;
11011 default:
11012 gcc_unreachable ();
11013 }
11014 }
11015 else
11016 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11017 }
11018
11019 /* Implement TARGET_OPTION_SAVE. */
11020
11021 static void
11022 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11023 {
11024 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11025 }
11026
11027 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11028 using the information saved in PTR. */
11029
11030 static void
11031 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11032 {
11033 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11034 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11035 opts->x_explicit_arch = ptr->x_explicit_arch;
11036 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11037 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11038
11039 aarch64_override_options_internal (opts);
11040 }
11041
11042 /* Implement TARGET_OPTION_PRINT. */
11043
11044 static void
11045 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11046 {
11047 const struct processor *cpu
11048 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11049 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11050 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11051 std::string extension
11052 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11053
11054 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11055 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11056 arch->name, extension.c_str ());
11057 }
11058
11059 static GTY(()) tree aarch64_previous_fndecl;
11060
11061 void
11062 aarch64_reset_previous_fndecl (void)
11063 {
11064 aarch64_previous_fndecl = NULL;
11065 }
11066
11067 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11068 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11069 make sure optab availability predicates are recomputed when necessary. */
11070
11071 void
11072 aarch64_save_restore_target_globals (tree new_tree)
11073 {
11074 if (TREE_TARGET_GLOBALS (new_tree))
11075 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11076 else if (new_tree == target_option_default_node)
11077 restore_target_globals (&default_target_globals);
11078 else
11079 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11080 }
11081
11082 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11083 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11084 of the function, if such exists. This function may be called multiple
11085 times on a single function so use aarch64_previous_fndecl to avoid
11086 setting up identical state. */
11087
11088 static void
11089 aarch64_set_current_function (tree fndecl)
11090 {
11091 if (!fndecl || fndecl == aarch64_previous_fndecl)
11092 return;
11093
11094 tree old_tree = (aarch64_previous_fndecl
11095 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11096 : NULL_TREE);
11097
11098 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11099
11100 /* If current function has no attributes but the previous one did,
11101 use the default node. */
11102 if (!new_tree && old_tree)
11103 new_tree = target_option_default_node;
11104
11105 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11106 the default have been handled by aarch64_save_restore_target_globals from
11107 aarch64_pragma_target_parse. */
11108 if (old_tree == new_tree)
11109 return;
11110
11111 aarch64_previous_fndecl = fndecl;
11112
11113 /* First set the target options. */
11114 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11115
11116 aarch64_save_restore_target_globals (new_tree);
11117 }
11118
11119 /* Enum describing the various ways we can handle attributes.
11120 In many cases we can reuse the generic option handling machinery. */
11121
11122 enum aarch64_attr_opt_type
11123 {
11124 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11125 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11126 aarch64_attr_enum, /* Attribute sets an enum variable. */
11127 aarch64_attr_custom /* Attribute requires a custom handling function. */
11128 };
11129
11130 /* All the information needed to handle a target attribute.
11131 NAME is the name of the attribute.
11132 ATTR_TYPE specifies the type of behavior of the attribute as described
11133 in the definition of enum aarch64_attr_opt_type.
11134 ALLOW_NEG is true if the attribute supports a "no-" form.
11135 HANDLER is the function that takes the attribute string as an argument.
11136 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11137 OPT_NUM is the enum specifying the option that the attribute modifies.
11138 This is needed for attributes that mirror the behavior of a command-line
11139 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11140 aarch64_attr_enum. */
11141
11142 struct aarch64_attribute_info
11143 {
11144 const char *name;
11145 enum aarch64_attr_opt_type attr_type;
11146 bool allow_neg;
11147 bool (*handler) (const char *);
11148 enum opt_code opt_num;
11149 };
11150
11151 /* Handle the ARCH_STR argument to the arch= target attribute. */
11152
11153 static bool
11154 aarch64_handle_attr_arch (const char *str)
11155 {
11156 const struct processor *tmp_arch = NULL;
11157 enum aarch64_parse_opt_result parse_res
11158 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11159
11160 if (parse_res == AARCH64_PARSE_OK)
11161 {
11162 gcc_assert (tmp_arch);
11163 selected_arch = tmp_arch;
11164 explicit_arch = selected_arch->arch;
11165 return true;
11166 }
11167
11168 switch (parse_res)
11169 {
11170 case AARCH64_PARSE_MISSING_ARG:
11171 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11172 break;
11173 case AARCH64_PARSE_INVALID_ARG:
11174 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11175 aarch64_print_hint_for_arch (str);
11176 break;
11177 case AARCH64_PARSE_INVALID_FEATURE:
11178 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11179 break;
11180 default:
11181 gcc_unreachable ();
11182 }
11183
11184 return false;
11185 }
11186
11187 /* Handle the argument CPU_STR to the cpu= target attribute. */
11188
11189 static bool
11190 aarch64_handle_attr_cpu (const char *str)
11191 {
11192 const struct processor *tmp_cpu = NULL;
11193 enum aarch64_parse_opt_result parse_res
11194 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11195
11196 if (parse_res == AARCH64_PARSE_OK)
11197 {
11198 gcc_assert (tmp_cpu);
11199 selected_tune = tmp_cpu;
11200 explicit_tune_core = selected_tune->ident;
11201
11202 selected_arch = &all_architectures[tmp_cpu->arch];
11203 explicit_arch = selected_arch->arch;
11204 return true;
11205 }
11206
11207 switch (parse_res)
11208 {
11209 case AARCH64_PARSE_MISSING_ARG:
11210 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11211 break;
11212 case AARCH64_PARSE_INVALID_ARG:
11213 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11214 aarch64_print_hint_for_core (str);
11215 break;
11216 case AARCH64_PARSE_INVALID_FEATURE:
11217 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11218 break;
11219 default:
11220 gcc_unreachable ();
11221 }
11222
11223 return false;
11224 }
11225
11226 /* Handle the argument STR to the tune= target attribute. */
11227
11228 static bool
11229 aarch64_handle_attr_tune (const char *str)
11230 {
11231 const struct processor *tmp_tune = NULL;
11232 enum aarch64_parse_opt_result parse_res
11233 = aarch64_parse_tune (str, &tmp_tune);
11234
11235 if (parse_res == AARCH64_PARSE_OK)
11236 {
11237 gcc_assert (tmp_tune);
11238 selected_tune = tmp_tune;
11239 explicit_tune_core = selected_tune->ident;
11240 return true;
11241 }
11242
11243 switch (parse_res)
11244 {
11245 case AARCH64_PARSE_INVALID_ARG:
11246 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11247 aarch64_print_hint_for_core (str);
11248 break;
11249 default:
11250 gcc_unreachable ();
11251 }
11252
11253 return false;
11254 }
11255
11256 /* Parse an architecture extensions target attribute string specified in STR.
11257 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11258 if successful. Update aarch64_isa_flags to reflect the ISA features
11259 modified. */
11260
11261 static bool
11262 aarch64_handle_attr_isa_flags (char *str)
11263 {
11264 enum aarch64_parse_opt_result parse_res;
11265 unsigned long isa_flags = aarch64_isa_flags;
11266
11267 /* We allow "+nothing" in the beginning to clear out all architectural
11268 features if the user wants to handpick specific features. */
11269 if (strncmp ("+nothing", str, 8) == 0)
11270 {
11271 isa_flags = 0;
11272 str += 8;
11273 }
11274
11275 parse_res = aarch64_parse_extension (str, &isa_flags);
11276
11277 if (parse_res == AARCH64_PARSE_OK)
11278 {
11279 aarch64_isa_flags = isa_flags;
11280 return true;
11281 }
11282
11283 switch (parse_res)
11284 {
11285 case AARCH64_PARSE_MISSING_ARG:
11286 error ("missing value in %<target()%> pragma or attribute");
11287 break;
11288
11289 case AARCH64_PARSE_INVALID_FEATURE:
11290 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11291 break;
11292
11293 default:
11294 gcc_unreachable ();
11295 }
11296
11297 return false;
11298 }
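/* A brief illustration of the extension strings accepted above (hypothetical
   attribute arguments): "+crc" adds CRC on top of the current
   aarch64_isa_flags, "+fp+nosimd" enables FP and then removes SIMD, and
   "+nothing+fp" first clears every architectural feature before enabling FP
   and whatever it implies.  */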
11299
11300 /* The target attributes that we support. On top of these we also support just
11301 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11302 handled explicitly in aarch64_process_one_target_attr. */
11303
11304 static const struct aarch64_attribute_info aarch64_attributes[] =
11305 {
11306 { "general-regs-only", aarch64_attr_mask, false, NULL,
11307 OPT_mgeneral_regs_only },
11308 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11309 OPT_mfix_cortex_a53_835769 },
11310 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11311 OPT_mfix_cortex_a53_843419 },
11312 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11313 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11314 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11315 OPT_momit_leaf_frame_pointer },
11316 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11317 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11318 OPT_march_ },
11319 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11320 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11321 OPT_mtune_ },
11322 { "sign-return-address", aarch64_attr_enum, false, NULL,
11323 OPT_msign_return_address_ },
11324 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11325 };
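/* A usage sketch in user-level C (hypothetical function names, not part of
   this file), exercising one attribute of each kind in the table above:

   __attribute__ ((target ("general-regs-only")))          // aarch64_attr_mask
   int gpr_only (int x) { return x + 1; }

   __attribute__ ((target ("no-omit-leaf-frame-pointer"))) // aarch64_attr_bool, negated
   int keep_frame_pointer (int x) { return x * 2; }

   __attribute__ ((target ("cmodel=small")))               // aarch64_attr_enum
   int small_model (void) { return 0; }

   __attribute__ ((target ("arch=armv8-a+crc")))           // aarch64_attr_custom
   unsigned with_crc (unsigned x) { return x ^ 1; }  */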
11326
11327 /* Parse ARG_STR which contains the definition of one target attribute.
11328 Show appropriate errors if any or return true if the attribute is valid. */
11329
11330 static bool
11331 aarch64_process_one_target_attr (char *arg_str)
11332 {
11333 bool invert = false;
11334
11335 size_t len = strlen (arg_str);
11336
11337 if (len == 0)
11338 {
11339 error ("malformed %<target()%> pragma or attribute");
11340 return false;
11341 }
11342
11343 char *str_to_check = (char *) alloca (len + 1);
11344 strcpy (str_to_check, arg_str);
11345
11346 /* Skip leading whitespace. */
11347 while (*str_to_check == ' ' || *str_to_check == '\t')
11348 str_to_check++;
11349
11350 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11351 It is easier to detect and handle it explicitly here rather than going
11352 through the machinery for the rest of the target attributes in this
11353 function. */
11354 if (*str_to_check == '+')
11355 return aarch64_handle_attr_isa_flags (str_to_check);
11356
11357 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11358 {
11359 invert = true;
11360 str_to_check += 3;
11361 }
11362 char *arg = strchr (str_to_check, '=');
11363
11364 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11365 and point ARG to "foo". */
11366 if (arg)
11367 {
11368 *arg = '\0';
11369 arg++;
11370 }
11371 const struct aarch64_attribute_info *p_attr;
11372 bool found = false;
11373 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11374 {
11375 /* If the names don't match up, or the user has given an argument
11376 to an attribute that doesn't accept one, or didn't give an argument
11377 to an attribute that expects one, fail to match. */
11378 if (strcmp (str_to_check, p_attr->name) != 0)
11379 continue;
11380
11381 found = true;
11382 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11383 || p_attr->attr_type == aarch64_attr_enum;
11384
11385 if (attr_need_arg_p ^ (arg != NULL))
11386 {
11387 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11388 return false;
11389 }
11390
11391 /* If the name matches but the attribute does not allow "no-" versions
11392 then we can't match. */
11393 if (invert && !p_attr->allow_neg)
11394 {
11395 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11396 return false;
11397 }
11398
11399 switch (p_attr->attr_type)
11400 {
11401 /* Has a custom handler registered.
11402 For example, cpu=, arch=, tune=. */
11403 case aarch64_attr_custom:
11404 gcc_assert (p_attr->handler);
11405 if (!p_attr->handler (arg))
11406 return false;
11407 break;
11408
11409 /* Either set or unset a boolean option. */
11410 case aarch64_attr_bool:
11411 {
11412 struct cl_decoded_option decoded;
11413
11414 generate_option (p_attr->opt_num, NULL, !invert,
11415 CL_TARGET, &decoded);
11416 aarch64_handle_option (&global_options, &global_options_set,
11417 &decoded, input_location);
11418 break;
11419 }
11420 /* Set or unset a bit in the target_flags. aarch64_handle_option
11421 should know what mask to apply given the option number. */
11422 case aarch64_attr_mask:
11423 {
11424 struct cl_decoded_option decoded;
11425 /* We only need to specify the option number.
11426 aarch64_handle_option will know which mask to apply. */
11427 decoded.opt_index = p_attr->opt_num;
11428 decoded.value = !invert;
11429 aarch64_handle_option (&global_options, &global_options_set,
11430 &decoded, input_location);
11431 break;
11432 }
11433 /* Use the option setting machinery to set an option to an enum. */
11434 case aarch64_attr_enum:
11435 {
11436 gcc_assert (arg);
11437 bool valid;
11438 int value;
11439 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11440 &value, CL_TARGET);
11441 if (valid)
11442 {
11443 set_option (&global_options, NULL, p_attr->opt_num, value,
11444 NULL, DK_UNSPECIFIED, input_location,
11445 global_dc);
11446 }
11447 else
11448 {
11449 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11450 }
11451 break;
11452 }
11453 default:
11454 gcc_unreachable ();
11455 }
11456 }
11457
11458 /* If we reached here we either have found an attribute and validated
11459 it or didn't match any. If we matched an attribute but its arguments
11460 were malformed we will have returned false already. */
11461 return found;
11462 }
11463
11464 /* Count how many times the character C appears in
11465 NULL-terminated string STR. */
11466
11467 static unsigned int
11468 num_occurences_in_str (char c, char *str)
11469 {
11470 unsigned int res = 0;
11471 while (*str != '\0')
11472 {
11473 if (*str == c)
11474 res++;
11475
11476 str++;
11477 }
11478
11479 return res;
11480 }
11481
11482 /* Parse the tree in ARGS that contains the target attribute information
11483 and update the global target options space. */
11484
11485 bool
11486 aarch64_process_target_attr (tree args)
11487 {
11488 if (TREE_CODE (args) == TREE_LIST)
11489 {
11490 do
11491 {
11492 tree head = TREE_VALUE (args);
11493 if (head)
11494 {
11495 if (!aarch64_process_target_attr (head))
11496 return false;
11497 }
11498 args = TREE_CHAIN (args);
11499 } while (args);
11500
11501 return true;
11502 }
11503
11504 if (TREE_CODE (args) != STRING_CST)
11505 {
11506 error ("attribute %<target%> argument not a string");
11507 return false;
11508 }
11509
11510 size_t len = strlen (TREE_STRING_POINTER (args));
11511 char *str_to_check = (char *) alloca (len + 1);
11512 strcpy (str_to_check, TREE_STRING_POINTER (args));
11513
11514 if (len == 0)
11515 {
11516 error ("malformed %<target()%> pragma or attribute");
11517 return false;
11518 }
11519
11520 /* Used to catch empty strings between commas, e.g.
11521 attribute ((target ("attr1,,attr2"))). */
11522 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11523
11524 /* Handle multiple target attributes separated by ','. */
11525 char *token = strtok (str_to_check, ",");
11526
11527 unsigned int num_attrs = 0;
11528 while (token)
11529 {
11530 num_attrs++;
11531 if (!aarch64_process_one_target_attr (token))
11532 {
11533 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11534 return false;
11535 }
11536
11537 token = strtok (NULL, ",");
11538 }
11539
11540 if (num_attrs != num_commas + 1)
11541 {
11542 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11543 return false;
11544 }
11545
11546 return true;
11547 }
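/* Two illustrative attribute strings for the splitting logic above
   (hypothetical examples):

     "arch=armv8-a+crc,strict-align"   splits into two attributes, each fed
                                       to aarch64_process_one_target_attr;
     "strict-align,,tune=cortex-a53"   has two commas but strtok yields only
                                       two tokens, so the num_attrs ==
                                       num_commas + 1 check rejects it.  */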
11548
11549 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11550 process attribute ((target ("..."))). */
11551
11552 static bool
11553 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11554 {
11555 struct cl_target_option cur_target;
11556 bool ret;
11557 tree old_optimize;
11558 tree new_target, new_optimize;
11559 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11560
11561 /* If what we're processing is the current pragma string then the
11562 target option node is already stored in target_option_current_node
11563 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11564 having to re-parse the string. This is especially useful to keep
11565 arm_neon.h compile times down since that header contains a lot
11566 of intrinsics enclosed in pragmas. */
11567 if (!existing_target && args == current_target_pragma)
11568 {
11569 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11570 return true;
11571 }
11572 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11573
11574 old_optimize = build_optimization_node (&global_options);
11575 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11576
11577 /* If the function changed the optimization levels as well as setting
11578 target options, start with the optimizations specified. */
11579 if (func_optimize && func_optimize != old_optimize)
11580 cl_optimization_restore (&global_options,
11581 TREE_OPTIMIZATION (func_optimize));
11582
11583 /* Save the current target options to restore at the end. */
11584 cl_target_option_save (&cur_target, &global_options);
11585
11586 /* If fndecl already has some target attributes applied to it, unpack
11587 them so that we add this attribute on top of them, rather than
11588 overwriting them. */
11589 if (existing_target)
11590 {
11591 struct cl_target_option *existing_options
11592 = TREE_TARGET_OPTION (existing_target);
11593
11594 if (existing_options)
11595 cl_target_option_restore (&global_options, existing_options);
11596 }
11597 else
11598 cl_target_option_restore (&global_options,
11599 TREE_TARGET_OPTION (target_option_current_node));
11600
11601 ret = aarch64_process_target_attr (args);
11602
11603 /* Set up any additional state. */
11604 if (ret)
11605 {
11606 aarch64_override_options_internal (&global_options);
11607 /* Initialize SIMD builtins if we haven't already.
11608 Set current_target_pragma to NULL for the duration so that
11609 the builtin initialization code doesn't try to tag the functions
11610 being built with the attributes specified by any current pragma, thus
11611 going into an infinite recursion. */
11612 if (TARGET_SIMD)
11613 {
11614 tree saved_current_target_pragma = current_target_pragma;
11615 current_target_pragma = NULL;
11616 aarch64_init_simd_builtins ();
11617 current_target_pragma = saved_current_target_pragma;
11618 }
11619 new_target = build_target_option_node (&global_options);
11620 }
11621 else
11622 new_target = NULL;
11623
11624 new_optimize = build_optimization_node (&global_options);
11625
11626 if (fndecl && ret)
11627 {
11628 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11629
11630 if (old_optimize != new_optimize)
11631 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11632 }
11633
11634 cl_target_option_restore (&global_options, &cur_target);
11635
11636 if (old_optimize != new_optimize)
11637 cl_optimization_restore (&global_options,
11638 TREE_OPTIMIZATION (old_optimize));
11639 return ret;
11640 }
11641
11642 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11643 tri-bool options (yes, no, don't care) and the default value is
11644 DEF, determine whether to reject inlining. */
11645
11646 static bool
11647 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11648 int dont_care, int def)
11649 {
11650 /* If the callee doesn't care, always allow inlining. */
11651 if (callee == dont_care)
11652 return true;
11653
11654 /* If the caller doesn't care, always allow inlining. */
11655 if (caller == dont_care)
11656 return true;
11657
11658 /* Otherwise, allow inlining if either the callee and caller values
11659 agree, or if the callee is using the default value. */
11660 return (callee == caller || callee == def);
11661 }
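/* A worked example, assuming DONT_CARE == 2 and DEF == 1 (the values used by
   the omit-leaf-frame-pointer check further down): (caller 0, callee 2) and
   (caller 2, callee 0) allow inlining because one side does not care;
   (caller 0, callee 1) also allows it because the callee uses the default;
   only combinations like (caller 1, callee 0) are rejected.  */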
11662
11663 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11664 to inline CALLEE into CALLER based on target-specific info.
11665 Make sure that the caller and callee have compatible architectural
11666 features. Then go through the other possible target attributes
11667 and see if they can block inlining. Try not to reject always_inline
11668 callees unless they are incompatible architecturally. */
11669
11670 static bool
11671 aarch64_can_inline_p (tree caller, tree callee)
11672 {
11673 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11674 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11675
11676 struct cl_target_option *caller_opts
11677 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11678 : target_option_default_node);
11679
11680 struct cl_target_option *callee_opts
11681 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11682 : target_option_default_node);
11683
11684 /* Callee's ISA flags should be a subset of the caller's. */
11685 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11686 != callee_opts->x_aarch64_isa_flags)
11687 return false;
11688
11689 /* Allow non-strict aligned functions inlining into strict
11690 aligned ones. */
11691 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11692 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11693 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11694 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11695 return false;
11696
11697 bool always_inline = lookup_attribute ("always_inline",
11698 DECL_ATTRIBUTES (callee));
11699
11700 /* If the architectural features match up and the callee is always_inline
11701 then the other attributes don't matter. */
11702 if (always_inline)
11703 return true;
11704
11705 if (caller_opts->x_aarch64_cmodel_var
11706 != callee_opts->x_aarch64_cmodel_var)
11707 return false;
11708
11709 if (caller_opts->x_aarch64_tls_dialect
11710 != callee_opts->x_aarch64_tls_dialect)
11711 return false;
11712
11713 /* Honour explicit requests to workaround errata. */
11714 if (!aarch64_tribools_ok_for_inlining_p (
11715 caller_opts->x_aarch64_fix_a53_err835769,
11716 callee_opts->x_aarch64_fix_a53_err835769,
11717 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11718 return false;
11719
11720 if (!aarch64_tribools_ok_for_inlining_p (
11721 caller_opts->x_aarch64_fix_a53_err843419,
11722 callee_opts->x_aarch64_fix_a53_err843419,
11723 2, TARGET_FIX_ERR_A53_843419))
11724 return false;
11725
11726 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11727 caller and callee and they don't match up, reject inlining. */
11728 if (!aarch64_tribools_ok_for_inlining_p (
11729 caller_opts->x_flag_omit_leaf_frame_pointer,
11730 callee_opts->x_flag_omit_leaf_frame_pointer,
11731 2, 1))
11732 return false;
11733
11734 /* If the callee has specific tuning overrides, respect them. */
11735 if (callee_opts->x_aarch64_override_tune_string != NULL
11736 && caller_opts->x_aarch64_override_tune_string == NULL)
11737 return false;
11738
11739 /* If the user specified tuning override strings for the
11740 caller and callee and they don't match up, reject inlining.
11741 We just do a string compare here, we don't analyze the meaning
11742 of the string, as it would be too costly for little gain. */
11743 if (callee_opts->x_aarch64_override_tune_string
11744 && caller_opts->x_aarch64_override_tune_string
11745 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11746 caller_opts->x_aarch64_override_tune_string) != 0))
11747 return false;
11748
11749 return true;
11750 }
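/* For instance (hypothetical functions): a callee declared with
   __attribute__ ((target ("+crc"))) cannot be inlined into a caller compiled
   without CRC, since the callee's ISA flags are not a subset of the caller's;
   inlining a plain callee into a "+crc" caller is fine, subject to the
   remaining checks above.  */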
11751
11752 /* Return true if SYMBOL_REF X binds locally. */
11753
11754 static bool
11755 aarch64_symbol_binds_local_p (const_rtx x)
11756 {
11757 return (SYMBOL_REF_DECL (x)
11758 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11759 : SYMBOL_REF_LOCAL_P (x));
11760 }
11761
11762 /* Return true if SYMBOL_REF X is thread local. */
11763 static bool
11764 aarch64_tls_symbol_p (rtx x)
11765 {
11766 if (! TARGET_HAVE_TLS)
11767 return false;
11768
11769 if (GET_CODE (x) != SYMBOL_REF)
11770 return false;
11771
11772 return SYMBOL_REF_TLS_MODEL (x) != 0;
11773 }
11774
11775 /* Classify a TLS symbol into one of the TLS kinds. */
11776 enum aarch64_symbol_type
11777 aarch64_classify_tls_symbol (rtx x)
11778 {
11779 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11780
11781 switch (tls_kind)
11782 {
11783 case TLS_MODEL_GLOBAL_DYNAMIC:
11784 case TLS_MODEL_LOCAL_DYNAMIC:
11785 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11786
11787 case TLS_MODEL_INITIAL_EXEC:
11788 switch (aarch64_cmodel)
11789 {
11790 case AARCH64_CMODEL_TINY:
11791 case AARCH64_CMODEL_TINY_PIC:
11792 return SYMBOL_TINY_TLSIE;
11793 default:
11794 return SYMBOL_SMALL_TLSIE;
11795 }
11796
11797 case TLS_MODEL_LOCAL_EXEC:
11798 if (aarch64_tls_size == 12)
11799 return SYMBOL_TLSLE12;
11800 else if (aarch64_tls_size == 24)
11801 return SYMBOL_TLSLE24;
11802 else if (aarch64_tls_size == 32)
11803 return SYMBOL_TLSLE32;
11804 else if (aarch64_tls_size == 48)
11805 return SYMBOL_TLSLE48;
11806 else
11807 gcc_unreachable ();
11808
11809 case TLS_MODEL_EMULATED:
11810 case TLS_MODEL_NONE:
11811 return SYMBOL_FORCE_TO_MEM;
11812
11813 default:
11814 gcc_unreachable ();
11815 }
11816 }
11817
11818 /* Return the correct method for accessing X + OFFSET, where X is either
11819 a SYMBOL_REF or LABEL_REF. */
11820
11821 enum aarch64_symbol_type
11822 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11823 {
11824 if (GET_CODE (x) == LABEL_REF)
11825 {
11826 switch (aarch64_cmodel)
11827 {
11828 case AARCH64_CMODEL_LARGE:
11829 return SYMBOL_FORCE_TO_MEM;
11830
11831 case AARCH64_CMODEL_TINY_PIC:
11832 case AARCH64_CMODEL_TINY:
11833 return SYMBOL_TINY_ABSOLUTE;
11834
11835 case AARCH64_CMODEL_SMALL_SPIC:
11836 case AARCH64_CMODEL_SMALL_PIC:
11837 case AARCH64_CMODEL_SMALL:
11838 return SYMBOL_SMALL_ABSOLUTE;
11839
11840 default:
11841 gcc_unreachable ();
11842 }
11843 }
11844
11845 if (GET_CODE (x) == SYMBOL_REF)
11846 {
11847 if (aarch64_tls_symbol_p (x))
11848 return aarch64_classify_tls_symbol (x);
11849
11850 switch (aarch64_cmodel)
11851 {
11852 case AARCH64_CMODEL_TINY:
11853 /* When we retrieve symbol + offset address, we have to make sure
11854 the offset does not cause overflow of the final address. But
11855 we have no way of knowing the address of symbol at compile time
11856 so we can't accurately say if the distance between the PC and
11857 symbol + offset is outside the addressable range of +/-1M in the
11858 TINY code model. So we rely on images not being greater than
11859 1M and cap the offset at 1M; anything beyond 1M will have to
11860 be loaded using an alternative mechanism. Furthermore, if the
11861 symbol is a weak reference to something that isn't known to
11862 resolve to a symbol in this module, then force to memory. */
11863 if ((SYMBOL_REF_WEAK (x)
11864 && !aarch64_symbol_binds_local_p (x))
11865 || !IN_RANGE (offset, -1048575, 1048575))
11866 return SYMBOL_FORCE_TO_MEM;
11867 return SYMBOL_TINY_ABSOLUTE;
11868
11869 case AARCH64_CMODEL_SMALL:
11870 /* Same reasoning as the tiny code model, but the offset cap here is
11871 4G. */
11872 if ((SYMBOL_REF_WEAK (x)
11873 && !aarch64_symbol_binds_local_p (x))
11874 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11875 HOST_WIDE_INT_C (4294967264)))
11876 return SYMBOL_FORCE_TO_MEM;
11877 return SYMBOL_SMALL_ABSOLUTE;
11878
11879 case AARCH64_CMODEL_TINY_PIC:
11880 if (!aarch64_symbol_binds_local_p (x))
11881 return SYMBOL_TINY_GOT;
11882 return SYMBOL_TINY_ABSOLUTE;
11883
11884 case AARCH64_CMODEL_SMALL_SPIC:
11885 case AARCH64_CMODEL_SMALL_PIC:
11886 if (!aarch64_symbol_binds_local_p (x))
11887 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11888 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11889 return SYMBOL_SMALL_ABSOLUTE;
11890
11891 case AARCH64_CMODEL_LARGE:
11892 /* This is alright even in PIC code as the constant
11893 pool reference is always PC relative and within
11894 the same translation unit. */
11895 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11896 return SYMBOL_SMALL_ABSOLUTE;
11897 else
11898 return SYMBOL_FORCE_TO_MEM;
11899
11900 default:
11901 gcc_unreachable ();
11902 }
11903 }
11904
11905 /* By default push everything into the constant pool. */
11906 return SYMBOL_FORCE_TO_MEM;
11907 }
11908
11909 bool
11910 aarch64_constant_address_p (rtx x)
11911 {
11912 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11913 }
11914
11915 bool
11916 aarch64_legitimate_pic_operand_p (rtx x)
11917 {
11918 if (GET_CODE (x) == SYMBOL_REF
11919 || (GET_CODE (x) == CONST
11920 && GET_CODE (XEXP (x, 0)) == PLUS
11921 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11922 return false;
11923
11924 return true;
11925 }
11926
11927 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11928 that should be rematerialized rather than spilled. */
11929
11930 static bool
11931 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11932 {
11933 /* Support CSE and rematerialization of common constants. */
11934 if (CONST_INT_P (x)
11935 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11936 || GET_CODE (x) == CONST_VECTOR)
11937 return true;
11938
11939 /* Do not allow vector struct mode constants for Advanced SIMD.
11940 We could support 0 and -1 easily, but they need support in
11941 aarch64-simd.md. */
11942 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11943 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11944 return false;
11945
11946 /* Only accept variable-length vector constants if they can be
11947 handled directly.
11948
11949 ??? It would be possible to handle rematerialization of other
11950 constants via secondary reloads. */
11951 if (vec_flags & VEC_ANY_SVE)
11952 return aarch64_simd_valid_immediate (x, NULL);
11953
11954 if (GET_CODE (x) == HIGH)
11955 x = XEXP (x, 0);
11956
11957 /* Accept polynomial constants that can be calculated by using the
11958 destination of a move as the sole temporary. Constants that
11959 require a second temporary cannot be rematerialized (they can't be
11960 forced to memory and also aren't legitimate constants). */
11961 poly_int64 offset;
11962 if (poly_int_rtx_p (x, &offset))
11963 return aarch64_offset_temporaries (false, offset) <= 1;
11964
11965 /* If an offset is being added to something else, we need to allow the
11966 base to be moved into the destination register, meaning that there
11967 are no free temporaries for the offset. */
11968 x = strip_offset (x, &offset);
11969 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11970 return false;
11971
11972 /* Do not allow const (plus (anchor_symbol, const_int)). */
11973 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11974 return false;
11975
11976 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11977 so spilling them is better than rematerialization. */
11978 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11979 return true;
11980
11981 /* Label references are always constant. */
11982 if (GET_CODE (x) == LABEL_REF)
11983 return true;
11984
11985 return false;
11986 }
11987
11988 rtx
11989 aarch64_load_tp (rtx target)
11990 {
11991 if (!target
11992 || GET_MODE (target) != Pmode
11993 || !register_operand (target, Pmode))
11994 target = gen_reg_rtx (Pmode);
11995
11996 /* Can return in any reg. */
11997 emit_insn (gen_aarch64_load_tp_hard (target));
11998 return target;
11999 }
12000
12001 /* On AAPCS systems, this is the "struct __va_list". */
12002 static GTY(()) tree va_list_type;
12003
12004 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12005 Return the type to use as __builtin_va_list.
12006
12007 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12008
12009 struct __va_list
12010 {
12011 void *__stack;
12012 void *__gr_top;
12013 void *__vr_top;
12014 int __gr_offs;
12015 int __vr_offs;
12016 }; */
12017
12018 static tree
12019 aarch64_build_builtin_va_list (void)
12020 {
12021 tree va_list_name;
12022 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12023
12024 /* Create the type. */
12025 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12026 /* Give it the required name. */
12027 va_list_name = build_decl (BUILTINS_LOCATION,
12028 TYPE_DECL,
12029 get_identifier ("__va_list"),
12030 va_list_type);
12031 DECL_ARTIFICIAL (va_list_name) = 1;
12032 TYPE_NAME (va_list_type) = va_list_name;
12033 TYPE_STUB_DECL (va_list_type) = va_list_name;
12034
12035 /* Create the fields. */
12036 f_stack = build_decl (BUILTINS_LOCATION,
12037 FIELD_DECL, get_identifier ("__stack"),
12038 ptr_type_node);
12039 f_grtop = build_decl (BUILTINS_LOCATION,
12040 FIELD_DECL, get_identifier ("__gr_top"),
12041 ptr_type_node);
12042 f_vrtop = build_decl (BUILTINS_LOCATION,
12043 FIELD_DECL, get_identifier ("__vr_top"),
12044 ptr_type_node);
12045 f_groff = build_decl (BUILTINS_LOCATION,
12046 FIELD_DECL, get_identifier ("__gr_offs"),
12047 integer_type_node);
12048 f_vroff = build_decl (BUILTINS_LOCATION,
12049 FIELD_DECL, get_identifier ("__vr_offs"),
12050 integer_type_node);
12051
12052 /* Tell tree-stdarg pass about our internal offset fields.
12053 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12054 purposes, to identify whether the code is updating va_list internal
12055 offset fields in an irregular way. */
12056 va_list_gpr_counter_field = f_groff;
12057 va_list_fpr_counter_field = f_vroff;
12058
12059 DECL_ARTIFICIAL (f_stack) = 1;
12060 DECL_ARTIFICIAL (f_grtop) = 1;
12061 DECL_ARTIFICIAL (f_vrtop) = 1;
12062 DECL_ARTIFICIAL (f_groff) = 1;
12063 DECL_ARTIFICIAL (f_vroff) = 1;
12064
12065 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12066 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12067 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12068 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12069 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12070
12071 TYPE_FIELDS (va_list_type) = f_stack;
12072 DECL_CHAIN (f_stack) = f_grtop;
12073 DECL_CHAIN (f_grtop) = f_vrtop;
12074 DECL_CHAIN (f_vrtop) = f_groff;
12075 DECL_CHAIN (f_groff) = f_vroff;
12076
12077 /* Compute its layout. */
12078 layout_type (va_list_type);
12079
12080 return va_list_type;
12081 }
12082
12083 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12084 static void
12085 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12086 {
12087 const CUMULATIVE_ARGS *cum;
12088 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12089 tree stack, grtop, vrtop, groff, vroff;
12090 tree t;
12091 int gr_save_area_size = cfun->va_list_gpr_size;
12092 int vr_save_area_size = cfun->va_list_fpr_size;
12093 int vr_offset;
12094
12095 cum = &crtl->args.info;
12096 if (cfun->va_list_gpr_size)
12097 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12098 cfun->va_list_gpr_size);
12099 if (cfun->va_list_fpr_size)
12100 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12101 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12102
12103 if (!TARGET_FLOAT)
12104 {
12105 gcc_assert (cum->aapcs_nvrn == 0);
12106 vr_save_area_size = 0;
12107 }
12108
12109 f_stack = TYPE_FIELDS (va_list_type_node);
12110 f_grtop = DECL_CHAIN (f_stack);
12111 f_vrtop = DECL_CHAIN (f_grtop);
12112 f_groff = DECL_CHAIN (f_vrtop);
12113 f_vroff = DECL_CHAIN (f_groff);
12114
12115 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12116 NULL_TREE);
12117 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12118 NULL_TREE);
12119 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12120 NULL_TREE);
12121 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12122 NULL_TREE);
12123 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12124 NULL_TREE);
12125
12126 /* Emit code to initialize STACK, which points to the next varargs stack
12127 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12128 by named arguments. STACK is 8-byte aligned. */
12129 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12130 if (cum->aapcs_stack_size > 0)
12131 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12132 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12133 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12134
12135 /* Emit code to initialize GRTOP, the top of the GR save area.
12136 virtual_incoming_args_rtx should have been 16 byte aligned. */
12137 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12138 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12139 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12140
12141 /* Emit code to initialize VRTOP, the top of the VR save area.
12142 This address is gr_save_area_bytes below GRTOP, rounded
12143 down to the next 16-byte boundary. */
12144 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12145 vr_offset = ROUND_UP (gr_save_area_size,
12146 STACK_BOUNDARY / BITS_PER_UNIT);
12147
12148 if (vr_offset)
12149 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12150 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12151 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12152
12153 /* Emit code to initialize GROFF, the offset from GRTOP of the
12154 next GPR argument. */
12155 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12156 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12157 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12158
12159 /* Likewise emit code to initialize VROFF, the offset from VRTOP
12160 of the next VR argument. */
12161 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12162 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12163 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12164 }
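/* A worked example for the expansion above, assuming the usual AAPCS64
   parameters (NUM_ARG_REGS == 8, NUM_FP_ARG_REGS == 8, UNITS_PER_WORD == 8,
   UNITS_PER_VREG == 16), no named arguments passed on the stack, and
   tree-stdarg limits that do not kick in.  For a variadic function whose
   named arguments consume two general registers and one vector register:

     gr_save_area_size = (8 - 2) * 8  = 48    =>  __gr_offs = -48
     vr_save_area_size = (8 - 1) * 16 = 112   =>  __vr_offs = -112
     __stack  = __gr_top = virtual incoming args pointer
     __vr_top = __gr_top - ROUND_UP (48, 16)  = __gr_top - 48  */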
12165
12166 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12167
12168 static tree
12169 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12170 gimple_seq *post_p ATTRIBUTE_UNUSED)
12171 {
12172 tree addr;
12173 bool indirect_p;
12174 bool is_ha; /* is HFA or HVA. */
12175 bool dw_align; /* double-word align. */
12176 machine_mode ag_mode = VOIDmode;
12177 int nregs;
12178 machine_mode mode;
12179
12180 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12181 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12182 HOST_WIDE_INT size, rsize, adjust, align;
12183 tree t, u, cond1, cond2;
12184
12185 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12186 if (indirect_p)
12187 type = build_pointer_type (type);
12188
12189 mode = TYPE_MODE (type);
12190
12191 f_stack = TYPE_FIELDS (va_list_type_node);
12192 f_grtop = DECL_CHAIN (f_stack);
12193 f_vrtop = DECL_CHAIN (f_grtop);
12194 f_groff = DECL_CHAIN (f_vrtop);
12195 f_vroff = DECL_CHAIN (f_groff);
12196
12197 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12198 f_stack, NULL_TREE);
12199 size = int_size_in_bytes (type);
12200 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12201
12202 dw_align = false;
12203 adjust = 0;
12204 if (aarch64_vfp_is_call_or_return_candidate (mode,
12205 type,
12206 &ag_mode,
12207 &nregs,
12208 &is_ha))
12209 {
12210 /* No frontends can create types with variable-sized modes, so we
12211 shouldn't be asked to pass or return them. */
12212 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12213
12214 /* TYPE passed in fp/simd registers. */
12215 if (!TARGET_FLOAT)
12216 aarch64_err_no_fpadvsimd (mode, "varargs");
12217
12218 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12219 unshare_expr (valist), f_vrtop, NULL_TREE);
12220 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12221 unshare_expr (valist), f_vroff, NULL_TREE);
12222
12223 rsize = nregs * UNITS_PER_VREG;
12224
12225 if (is_ha)
12226 {
12227 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12228 adjust = UNITS_PER_VREG - ag_size;
12229 }
12230 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12231 && size < UNITS_PER_VREG)
12232 {
12233 adjust = UNITS_PER_VREG - size;
12234 }
12235 }
12236 else
12237 {
12238 /* TYPE passed in general registers. */
12239 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12240 unshare_expr (valist), f_grtop, NULL_TREE);
12241 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12242 unshare_expr (valist), f_groff, NULL_TREE);
12243 rsize = ROUND_UP (size, UNITS_PER_WORD);
12244 nregs = rsize / UNITS_PER_WORD;
12245
12246 if (align > 8)
12247 dw_align = true;
12248
12249 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12250 && size < UNITS_PER_WORD)
12251 {
12252 adjust = UNITS_PER_WORD - size;
12253 }
12254 }
12255
12256 /* Get a local temporary for the field value. */
12257 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12258
12259 /* Emit code to branch if off >= 0. */
12260 t = build2 (GE_EXPR, boolean_type_node, off,
12261 build_int_cst (TREE_TYPE (off), 0));
12262 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12263
12264 if (dw_align)
12265 {
12266 /* Emit: offs = (offs + 15) & -16. */
12267 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12268 build_int_cst (TREE_TYPE (off), 15));
12269 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12270 build_int_cst (TREE_TYPE (off), -16));
12271 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12272 }
12273 else
12274 roundup = NULL;
12275
12276 /* Update ap.__[g|v]r_offs */
12277 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12278 build_int_cst (TREE_TYPE (off), rsize));
12279 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12280
12281 /* String up. */
12282 if (roundup)
12283 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12284
12285 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12286 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12287 build_int_cst (TREE_TYPE (f_off), 0));
12288 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12289
12290 /* String up: make sure the assignment happens before the use. */
12291 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12292 COND_EXPR_ELSE (cond1) = t;
12293
12294 /* Prepare the trees handling the argument that is passed on the stack;
12295 the top-level node will be stored in ON_STACK. */
12296 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12297 if (align > 8)
12298 {
12299 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12300 t = fold_build_pointer_plus_hwi (arg, 15);
12301 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12302 build_int_cst (TREE_TYPE (t), -16));
12303 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12304 }
12305 else
12306 roundup = NULL;
12307 /* Advance ap.__stack */
12308 t = fold_build_pointer_plus_hwi (arg, size + 7);
12309 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12310 build_int_cst (TREE_TYPE (t), -8));
12311 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12312 /* String up roundup and advance. */
12313 if (roundup)
12314 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12315 /* String up with arg */
12316 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12317 /* Big-endianness related address adjustment. */
12318 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12319 && size < UNITS_PER_WORD)
12320 {
12321 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12322 size_int (UNITS_PER_WORD - size));
12323 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12324 }
12325
12326 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12327 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12328
12329 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12330 t = off;
12331 if (adjust)
12332 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12333 build_int_cst (TREE_TYPE (off), adjust));
12334
12335 t = fold_convert (sizetype, t);
12336 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12337
12338 if (is_ha)
12339 {
12340 /* type ha; // treat as "struct {ftype field[n];}"
12341 ... [computing offs]
12342 for (i = 0; i < nregs; ++i, offs += 16)
12343 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12344 return ha; */
12345 int i;
12346 tree tmp_ha, field_t, field_ptr_t;
12347
12348 /* Declare a local variable. */
12349 tmp_ha = create_tmp_var_raw (type, "ha");
12350 gimple_add_tmp_var (tmp_ha);
12351
12352 /* Establish the base type. */
12353 switch (ag_mode)
12354 {
12355 case E_SFmode:
12356 field_t = float_type_node;
12357 field_ptr_t = float_ptr_type_node;
12358 break;
12359 case E_DFmode:
12360 field_t = double_type_node;
12361 field_ptr_t = double_ptr_type_node;
12362 break;
12363 case E_TFmode:
12364 field_t = long_double_type_node;
12365 field_ptr_t = long_double_ptr_type_node;
12366 break;
12367 case E_HFmode:
12368 field_t = aarch64_fp16_type_node;
12369 field_ptr_t = aarch64_fp16_ptr_type_node;
12370 break;
12371 case E_V2SImode:
12372 case E_V4SImode:
12373 {
12374 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12375 field_t = build_vector_type_for_mode (innertype, ag_mode);
12376 field_ptr_t = build_pointer_type (field_t);
12377 }
12378 break;
12379 default:
12380 gcc_assert (0);
12381 }
12382
12383 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12384 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12385 addr = t;
12386 t = fold_convert (field_ptr_t, addr);
12387 t = build2 (MODIFY_EXPR, field_t,
12388 build1 (INDIRECT_REF, field_t, tmp_ha),
12389 build1 (INDIRECT_REF, field_t, t));
12390
12391 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12392 for (i = 1; i < nregs; ++i)
12393 {
12394 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12395 u = fold_convert (field_ptr_t, addr);
12396 u = build2 (MODIFY_EXPR, field_t,
12397 build2 (MEM_REF, field_t, tmp_ha,
12398 build_int_cst (field_ptr_t,
12399 (i *
12400 int_size_in_bytes (field_t)))),
12401 build1 (INDIRECT_REF, field_t, u));
12402 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12403 }
12404
12405 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12406 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12407 }
12408
12409 COND_EXPR_ELSE (cond2) = t;
12410 addr = fold_convert (build_pointer_type (type), cond1);
12411 addr = build_va_arg_indirect_ref (addr);
12412
12413 if (indirect_p)
12414 addr = build_va_arg_indirect_ref (addr);
12415
12416 return addr;
12417 }
12418
12419 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12420
12421 static void
12422 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12423 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12424 int no_rtl)
12425 {
12426 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12427 CUMULATIVE_ARGS local_cum;
12428 int gr_saved = cfun->va_list_gpr_size;
12429 int vr_saved = cfun->va_list_fpr_size;
12430
12431 /* The caller has advanced CUM up to, but not beyond, the last named
12432 argument. Advance a local copy of CUM past the last "real" named
12433 argument, to find out how many registers are left over. */
12434 local_cum = *cum;
12435 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12436
12437 /* Find out how many registers we need to save.
12438 Honor tree-stdarg analysis results. */
12439 if (cfun->va_list_gpr_size)
12440 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12441 cfun->va_list_gpr_size / UNITS_PER_WORD);
12442 if (cfun->va_list_fpr_size)
12443 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12444 cfun->va_list_fpr_size / UNITS_PER_VREG);
12445
12446 if (!TARGET_FLOAT)
12447 {
12448 gcc_assert (local_cum.aapcs_nvrn == 0);
12449 vr_saved = 0;
12450 }
12451
12452 if (!no_rtl)
12453 {
12454 if (gr_saved > 0)
12455 {
12456 rtx ptr, mem;
12457
12458 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12459 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12460 - gr_saved * UNITS_PER_WORD);
12461 mem = gen_frame_mem (BLKmode, ptr);
12462 set_mem_alias_set (mem, get_varargs_alias_set ());
12463
12464 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12465 mem, gr_saved);
12466 }
12467 if (vr_saved > 0)
12468 {
12469 /* We can't use move_block_from_reg, because it will use
12470 the wrong mode, storing D regs only. */
12471 machine_mode mode = TImode;
12472 int off, i, vr_start;
12473
12474 /* Set OFF to the offset from virtual_incoming_args_rtx of
12475 the first vector register. The VR save area lies below
12476 the GR one, and is aligned to 16 bytes. */
12477 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12478 STACK_BOUNDARY / BITS_PER_UNIT);
12479 off -= vr_saved * UNITS_PER_VREG;
12480
12481 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12482 for (i = 0; i < vr_saved; ++i)
12483 {
12484 rtx ptr, mem;
12485
12486 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12487 mem = gen_frame_mem (mode, ptr);
12488 set_mem_alias_set (mem, get_varargs_alias_set ());
12489 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12490 off += UNITS_PER_VREG;
12491 }
12492 }
12493 }
12494
12495 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12496 any complication of having crtl->args.pretend_args_size changed. */
12497 cfun->machine->frame.saved_varargs_size
12498 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12499 STACK_BOUNDARY / BITS_PER_UNIT)
12500 + vr_saved * UNITS_PER_VREG);
12501 }
12502
12503 static void
12504 aarch64_conditional_register_usage (void)
12505 {
12506 int i;
12507 if (!TARGET_FLOAT)
12508 {
12509 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12510 {
12511 fixed_regs[i] = 1;
12512 call_used_regs[i] = 1;
12513 }
12514 }
12515 if (!TARGET_SVE)
12516 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12517 {
12518 fixed_regs[i] = 1;
12519 call_used_regs[i] = 1;
12520 }
12521 }
12522
12523 /* Walk down the type tree of TYPE counting consecutive base elements.
12524 If *MODEP is VOIDmode, then set it to the first valid floating point
12525 type. If a non-floating point type is found, or if a floating point
12526 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12527 otherwise return the count in the sub-tree. */
12528 static int
12529 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12530 {
12531 machine_mode mode;
12532 HOST_WIDE_INT size;
12533
12534 switch (TREE_CODE (type))
12535 {
12536 case REAL_TYPE:
12537 mode = TYPE_MODE (type);
12538 if (mode != DFmode && mode != SFmode
12539 && mode != TFmode && mode != HFmode)
12540 return -1;
12541
12542 if (*modep == VOIDmode)
12543 *modep = mode;
12544
12545 if (*modep == mode)
12546 return 1;
12547
12548 break;
12549
12550 case COMPLEX_TYPE:
12551 mode = TYPE_MODE (TREE_TYPE (type));
12552 if (mode != DFmode && mode != SFmode
12553 && mode != TFmode && mode != HFmode)
12554 return -1;
12555
12556 if (*modep == VOIDmode)
12557 *modep = mode;
12558
12559 if (*modep == mode)
12560 return 2;
12561
12562 break;
12563
12564 case VECTOR_TYPE:
12565 /* Use V2SImode and V4SImode as representatives of all 64-bit
12566 and 128-bit vector types. */
12567 size = int_size_in_bytes (type);
12568 switch (size)
12569 {
12570 case 8:
12571 mode = V2SImode;
12572 break;
12573 case 16:
12574 mode = V4SImode;
12575 break;
12576 default:
12577 return -1;
12578 }
12579
12580 if (*modep == VOIDmode)
12581 *modep = mode;
12582
12583 /* Vector modes are considered to be opaque: two vectors are
12584 equivalent for the purposes of being homogeneous aggregates
12585 if they are the same size. */
12586 if (*modep == mode)
12587 return 1;
12588
12589 break;
12590
12591 case ARRAY_TYPE:
12592 {
12593 int count;
12594 tree index = TYPE_DOMAIN (type);
12595
12596 /* Can't handle incomplete types nor sizes that are not
12597 fixed. */
12598 if (!COMPLETE_TYPE_P (type)
12599 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12600 return -1;
12601
12602 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12603 if (count == -1
12604 || !index
12605 || !TYPE_MAX_VALUE (index)
12606 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12607 || !TYPE_MIN_VALUE (index)
12608 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12609 || count < 0)
12610 return -1;
12611
12612 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12613 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12614
12615 /* There must be no padding. */
12616 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12617 count * GET_MODE_BITSIZE (*modep)))
12618 return -1;
12619
12620 return count;
12621 }
12622
12623 case RECORD_TYPE:
12624 {
12625 int count = 0;
12626 int sub_count;
12627 tree field;
12628
12629 /* Can't handle incomplete types nor sizes that are not
12630 fixed. */
12631 if (!COMPLETE_TYPE_P (type)
12632 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12633 return -1;
12634
12635 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12636 {
12637 if (TREE_CODE (field) != FIELD_DECL)
12638 continue;
12639
12640 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12641 if (sub_count < 0)
12642 return -1;
12643 count += sub_count;
12644 }
12645
12646 /* There must be no padding. */
12647 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12648 count * GET_MODE_BITSIZE (*modep)))
12649 return -1;
12650
12651 return count;
12652 }
12653
12654 case UNION_TYPE:
12655 case QUAL_UNION_TYPE:
12656 {
12657 /* These aren't very interesting except in a degenerate case. */
12658 int count = 0;
12659 int sub_count;
12660 tree field;
12661
12662 /* Can't handle incomplete types nor sizes that are not
12663 fixed. */
12664 if (!COMPLETE_TYPE_P (type)
12665 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12666 return -1;
12667
12668 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12669 {
12670 if (TREE_CODE (field) != FIELD_DECL)
12671 continue;
12672
12673 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12674 if (sub_count < 0)
12675 return -1;
12676 count = count > sub_count ? count : sub_count;
12677 }
12678
12679 /* There must be no padding. */
12680 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12681 count * GET_MODE_BITSIZE (*modep)))
12682 return -1;
12683
12684 return count;
12685 }
12686
12687 default:
12688 break;
12689 }
12690
12691 return -1;
12692 }
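/* Examples of the counting above (hypothetical types):

     struct { float x, y, z; }                ->  3, *modep == SFmode
     struct { double r; _Complex double z; }  ->  3, *modep == DFmode
     struct { float x; double y; }            -> -1  (mixed base modes)
     float [5]                                ->  5  (the four-field HFA limit
                                                      is enforced by the caller,
                                                      not here).  */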
12693
12694 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12695 type as described in AAPCS64 \S 4.1.2.
12696
12697 See the comment above aarch64_composite_type_p for the notes on MODE. */
12698
12699 static bool
12700 aarch64_short_vector_p (const_tree type,
12701 machine_mode mode)
12702 {
12703 poly_int64 size = -1;
12704
12705 if (type && TREE_CODE (type) == VECTOR_TYPE)
12706 size = int_size_in_bytes (type);
12707 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12708 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12709 size = GET_MODE_SIZE (mode);
12710
12711 return known_eq (size, 8) || known_eq (size, 16);
12712 }
12713
12714 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12715 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12716 array types. The C99 floating-point complex types are also considered
12717 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12718 types, which are GCC extensions and out of the scope of AAPCS64, are
12719 treated as composite types here as well.
12720
12721 Note that MODE itself is not sufficient in determining whether a type
12722 is such a composite type or not. This is because
12723 stor-layout.c:compute_record_mode may have already changed the MODE
12724 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12725 structure with only one field may have its MODE set to the mode of the
12726 field. Also an integer mode whose size matches the size of the
12727 RECORD_TYPE type may be used to substitute the original mode
12728 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12729 solely relied on. */
12730
12731 static bool
12732 aarch64_composite_type_p (const_tree type,
12733 machine_mode mode)
12734 {
12735 if (aarch64_short_vector_p (type, mode))
12736 return false;
12737
12738 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12739 return true;
12740
12741 if (mode == BLKmode
12742 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12743 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12744 return true;
12745
12746 return false;
12747 }
12748
12749 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12750 shall be passed or returned in simd/fp register(s) (providing these
12751 parameter passing registers are available).
12752
12753 Upon successful return, *COUNT returns the number of needed registers,
12754 *BASE_MODE returns the mode of the individual register and when IS_HAF
12755 *BASE_MODE returns the mode of the individual register and when IS_HA
12756 floating-point aggregate or a homogeneous short-vector aggregate. */
12757
12758 static bool
12759 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12760 const_tree type,
12761 machine_mode *base_mode,
12762 int *count,
12763 bool *is_ha)
12764 {
12765 machine_mode new_mode = VOIDmode;
12766 bool composite_p = aarch64_composite_type_p (type, mode);
12767
12768 if (is_ha != NULL) *is_ha = false;
12769
12770 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12771 || aarch64_short_vector_p (type, mode))
12772 {
12773 *count = 1;
12774 new_mode = mode;
12775 }
12776 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12777 {
12778 if (is_ha != NULL) *is_ha = true;
12779 *count = 2;
12780 new_mode = GET_MODE_INNER (mode);
12781 }
12782 else if (type && composite_p)
12783 {
12784 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12785
12786 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12787 {
12788 if (is_ha != NULL) *is_ha = true;
12789 *count = ag_count;
12790 }
12791 else
12792 return false;
12793 }
12794 else
12795 return false;
12796
12797 *base_mode = new_mode;
12798 return true;
12799 }
12800
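/* Editor's note: an illustrative example for the helpers above; it is
   not part of the original source.  Under these rules a C type such as

       struct rgb { float r, g, b; };

   is a homogeneous floating-point aggregate: aapcs_vfp_sub_candidate
   returns 3 with *modep set to SFmode, so *count becomes 3, *base_mode
   SFmode and *is_ha true (3 fields is within HA_MAX_NUM_FLDS).  A struct
   that mixes float and double fields fails the common-mode check and is
   treated as an ordinary composite instead.  */
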
12801 /* Implement TARGET_STRUCT_VALUE_RTX. */
12802
12803 static rtx
12804 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12805 int incoming ATTRIBUTE_UNUSED)
12806 {
12807 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12808 }
12809
12810 /* Implements target hook vector_mode_supported_p. */
12811 static bool
12812 aarch64_vector_mode_supported_p (machine_mode mode)
12813 {
12814 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12815 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12816 }
12817
12818 /* Return appropriate SIMD container
12819 for MODE within a vector of WIDTH bits. */
12820 static machine_mode
12821 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12822 {
12823 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12824 switch (mode)
12825 {
12826 case E_DFmode:
12827 return VNx2DFmode;
12828 case E_SFmode:
12829 return VNx4SFmode;
12830 case E_HFmode:
12831 return VNx8HFmode;
12832 case E_DImode:
12833 return VNx2DImode;
12834 case E_SImode:
12835 return VNx4SImode;
12836 case E_HImode:
12837 return VNx8HImode;
12838 case E_QImode:
12839 return VNx16QImode;
12840 default:
12841 return word_mode;
12842 }
12843
12844 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12845 if (TARGET_SIMD)
12846 {
12847 if (known_eq (width, 128))
12848 switch (mode)
12849 {
12850 case E_DFmode:
12851 return V2DFmode;
12852 case E_SFmode:
12853 return V4SFmode;
12854 case E_HFmode:
12855 return V8HFmode;
12856 case E_SImode:
12857 return V4SImode;
12858 case E_HImode:
12859 return V8HImode;
12860 case E_QImode:
12861 return V16QImode;
12862 case E_DImode:
12863 return V2DImode;
12864 default:
12865 break;
12866 }
12867 else
12868 switch (mode)
12869 {
12870 case E_SFmode:
12871 return V2SFmode;
12872 case E_HFmode:
12873 return V4HFmode;
12874 case E_SImode:
12875 return V2SImode;
12876 case E_HImode:
12877 return V4HImode;
12878 case E_QImode:
12879 return V8QImode;
12880 default:
12881 break;
12882 }
12883 }
12884 return word_mode;
12885 }
12886
12887 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12888 static machine_mode
12889 aarch64_preferred_simd_mode (scalar_mode mode)
12890 {
12891 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12892 return aarch64_simd_container_mode (mode, bits);
12893 }
12894
12895 /* Return a list of possible vector sizes for the vectorizer
12896 to iterate over. */
12897 static void
12898 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12899 {
12900 if (TARGET_SVE)
12901 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12902 sizes->safe_push (16);
12903 sizes->safe_push (8);
12904 }
12905
12906 /* Implement TARGET_MANGLE_TYPE. */
12907
12908 static const char *
12909 aarch64_mangle_type (const_tree type)
12910 {
12911 /* The AArch64 ABI documents say that "__va_list" has to be
12912 mangled as if it is in the "std" namespace.  */
12913 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12914 return "St9__va_list";
12915
12916 /* Half-precision float. */
12917 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12918 return "Dh";
12919
12920 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12921 builtin types. */
12922 if (TYPE_NAME (type) != NULL)
12923 return aarch64_mangle_builtin_type (type);
12924
12925 /* Use the default mangling. */
12926 return NULL;
12927 }
12928
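/* Editor's note: illustrative examples, not part of the original source.
   With the rules above, the C++ declarations

       void f (__fp16);              // mangled roughly as _Z1fDh
       void g (__builtin_va_list);   // mangled roughly as _Z1gSt9__va_list

   use "Dh" for the half-precision type and the "std"-qualified name for
   va_list, as required by the AArch64 ABI.  */
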
12929 /* Find the first rtx_insn before insn that will generate an assembly
12930 instruction. */
12931
12932 static rtx_insn *
12933 aarch64_prev_real_insn (rtx_insn *insn)
12934 {
12935 if (!insn)
12936 return NULL;
12937
12938 do
12939 {
12940 insn = prev_real_insn (insn);
12941 }
12942 while (insn && recog_memoized (insn) < 0);
12943
12944 return insn;
12945 }
12946
12947 static bool
12948 is_madd_op (enum attr_type t1)
12949 {
12950 unsigned int i;
12951 /* A number of these may be AArch32 only. */
12952 enum attr_type mlatypes[] = {
12953 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12954 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12955 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12956 };
12957
12958 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12959 {
12960 if (t1 == mlatypes[i])
12961 return true;
12962 }
12963
12964 return false;
12965 }
12966
12967 /* Check if there is a register dependency between a load and the insn
12968 for which we hold recog_data. */
12969
12970 static bool
12971 dep_between_memop_and_curr (rtx memop)
12972 {
12973 rtx load_reg;
12974 int opno;
12975
12976 gcc_assert (GET_CODE (memop) == SET);
12977
12978 if (!REG_P (SET_DEST (memop)))
12979 return false;
12980
12981 load_reg = SET_DEST (memop);
12982 for (opno = 1; opno < recog_data.n_operands; opno++)
12983 {
12984 rtx operand = recog_data.operand[opno];
12985 if (REG_P (operand)
12986 && reg_overlap_mentioned_p (load_reg, operand))
12987 return true;
12988
12989 }
12990 return false;
12991 }
12992
12993
12994 /* When working around the Cortex-A53 erratum 835769,
12995 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12996 instruction and has a preceding memory instruction such that a NOP
12997 should be inserted between them. */
12998
12999 bool
13000 aarch64_madd_needs_nop (rtx_insn* insn)
13001 {
13002 enum attr_type attr_type;
13003 rtx_insn *prev;
13004 rtx body;
13005
13006 if (!TARGET_FIX_ERR_A53_835769)
13007 return false;
13008
13009 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13010 return false;
13011
13012 attr_type = get_attr_type (insn);
13013 if (!is_madd_op (attr_type))
13014 return false;
13015
13016 prev = aarch64_prev_real_insn (insn);
13017 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13018 Restore recog state to INSN to avoid state corruption. */
13019 extract_constrain_insn_cached (insn);
13020
13021 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13022 return false;
13023
13024 body = single_set (prev);
13025
13026 /* If the previous insn is a memory op and there is no dependency between
13027 it and the DImode madd, emit a NOP between them. If body is NULL then we
13028 have a complex memory operation, probably a load/store pair.
13029 Be conservative for now and emit a NOP. */
13030 if (GET_MODE (recog_data.operand[0]) == DImode
13031 && (!body || !dep_between_memop_and_curr (body)))
13032 return true;
13033
13034 return false;
13035
13036 }
13037
13038
13039 /* Implement FINAL_PRESCAN_INSN. */
13040
13041 void
13042 aarch64_final_prescan_insn (rtx_insn *insn)
13043 {
13044 if (aarch64_madd_needs_nop (insn))
13045 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13046 }
13047
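/* Editor's note: an illustrative sketch of the workaround above; it is
   not part of the original source.  With -mfix-cortex-a53-835769, a
   64-bit multiply-accumulate that follows a memory operation and meets
   the conditions above has a NOP inserted between the two, e.g.

       ldr   x1, [x2]
       nop   // between mem op and mult-accumulate
       madd  x0, x3, x4, x5

   Without the option the NOP is omitted.  */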
13048
13049 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13050 instruction. */
13051
13052 bool
13053 aarch64_sve_index_immediate_p (rtx base_or_step)
13054 {
13055 return (CONST_INT_P (base_or_step)
13056 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13057 }
13058
13059 /* Return true if X is a valid immediate for the SVE ADD and SUB
13060 instructions. Negate X first if NEGATE_P is true. */
13061
13062 bool
13063 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13064 {
13065 rtx elt;
13066
13067 if (!const_vec_duplicate_p (x, &elt)
13068 || !CONST_INT_P (elt))
13069 return false;
13070
13071 HOST_WIDE_INT val = INTVAL (elt);
13072 if (negate_p)
13073 val = -val;
13074 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13075
13076 if (val & 0xff)
13077 return IN_RANGE (val, 0, 0xff);
13078 return IN_RANGE (val, 0, 0xff00);
13079 }
13080
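/* Editor's note: worked examples for the check above, not part of the
   original source.  A duplicated element value of 7 is accepted (8-bit
   immediate) and 0x4500 is accepted (0x45 with LSL #8), but 0x101 is
   rejected: its low byte is non-zero, so it would have to fit in 8 bits
   and does not.  These correspond to the SVE ADD/SUB immediate forms
   #imm8 and #imm8, LSL #8.  */
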
13081 /* Return true if X is a valid immediate operand for an SVE logical
13082 instruction such as AND. */
13083
13084 bool
13085 aarch64_sve_bitmask_immediate_p (rtx x)
13086 {
13087 rtx elt;
13088
13089 return (const_vec_duplicate_p (x, &elt)
13090 && CONST_INT_P (elt)
13091 && aarch64_bitmask_imm (INTVAL (elt),
13092 GET_MODE_INNER (GET_MODE (x))));
13093 }
13094
13095 /* Return true if X is a valid immediate for the SVE DUP and CPY
13096 instructions. */
13097
13098 bool
13099 aarch64_sve_dup_immediate_p (rtx x)
13100 {
13101 rtx elt;
13102
13103 if (!const_vec_duplicate_p (x, &elt)
13104 || !CONST_INT_P (elt))
13105 return false;
13106
13107 HOST_WIDE_INT val = INTVAL (elt);
13108 if (val & 0xff)
13109 return IN_RANGE (val, -0x80, 0x7f);
13110 return IN_RANGE (val, -0x8000, 0x7f00);
13111 }
13112
13113 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13114 SIGNED_P says whether the operand is signed rather than unsigned. */
13115
13116 bool
13117 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13118 {
13119 rtx elt;
13120
13121 return (const_vec_duplicate_p (x, &elt)
13122 && CONST_INT_P (elt)
13123 && (signed_p
13124 ? IN_RANGE (INTVAL (elt), -16, 15)
13125 : IN_RANGE (INTVAL (elt), 0, 127)));
13126 }
13127
13128 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13129 instruction. Negate X first if NEGATE_P is true. */
13130
13131 bool
13132 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13133 {
13134 rtx elt;
13135 REAL_VALUE_TYPE r;
13136
13137 if (!const_vec_duplicate_p (x, &elt)
13138 || GET_CODE (elt) != CONST_DOUBLE)
13139 return false;
13140
13141 r = *CONST_DOUBLE_REAL_VALUE (elt);
13142
13143 if (negate_p)
13144 r = real_value_negate (&r);
13145
13146 if (real_equal (&r, &dconst1))
13147 return true;
13148 if (real_equal (&r, &dconsthalf))
13149 return true;
13150 return false;
13151 }
13152
13153 /* Return true if X is a valid immediate operand for an SVE FMUL
13154 instruction. */
13155
13156 bool
13157 aarch64_sve_float_mul_immediate_p (rtx x)
13158 {
13159 rtx elt;
13160
13161 /* GCC will never generate a multiply with an immediate of 2, so there is no
13162 point testing for it (even though it is a valid constant). */
13163 return (const_vec_duplicate_p (x, &elt)
13164 && GET_CODE (elt) == CONST_DOUBLE
13165 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13166 }
13167
13168 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13169 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13170 is nonnull, use it to describe valid immediates. */
13171 static bool
13172 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13173 simd_immediate_info *info,
13174 enum simd_immediate_check which,
13175 simd_immediate_info::insn_type insn)
13176 {
13177 /* Try a 4-byte immediate with LSL. */
13178 for (unsigned int shift = 0; shift < 32; shift += 8)
13179 if ((val32 & (0xff << shift)) == val32)
13180 {
13181 if (info)
13182 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13183 simd_immediate_info::LSL, shift);
13184 return true;
13185 }
13186
13187 /* Try a 2-byte immediate with LSL. */
13188 unsigned int imm16 = val32 & 0xffff;
13189 if (imm16 == (val32 >> 16))
13190 for (unsigned int shift = 0; shift < 16; shift += 8)
13191 if ((imm16 & (0xff << shift)) == imm16)
13192 {
13193 if (info)
13194 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13195 simd_immediate_info::LSL, shift);
13196 return true;
13197 }
13198
13199 /* Try a 4-byte immediate with MSL, except for cases that MVN
13200 can handle. */
13201 if (which == AARCH64_CHECK_MOV)
13202 for (unsigned int shift = 8; shift < 24; shift += 8)
13203 {
13204 unsigned int low = (1 << shift) - 1;
13205 if (((val32 & (0xff << shift)) | low) == val32)
13206 {
13207 if (info)
13208 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13209 simd_immediate_info::MSL, shift);
13210 return true;
13211 }
13212 }
13213
13214 return false;
13215 }
13216
13217 /* Return true if replicating VAL64 is a valid immediate for the
13218 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13219 use it to describe valid immediates. */
13220 static bool
13221 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13222 simd_immediate_info *info,
13223 enum simd_immediate_check which)
13224 {
13225 unsigned int val32 = val64 & 0xffffffff;
13226 unsigned int val16 = val64 & 0xffff;
13227 unsigned int val8 = val64 & 0xff;
13228
13229 if (val32 == (val64 >> 32))
13230 {
13231 if ((which & AARCH64_CHECK_ORR) != 0
13232 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13233 simd_immediate_info::MOV))
13234 return true;
13235
13236 if ((which & AARCH64_CHECK_BIC) != 0
13237 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13238 simd_immediate_info::MVN))
13239 return true;
13240
13241 /* Try using a replicated byte. */
13242 if (which == AARCH64_CHECK_MOV
13243 && val16 == (val32 >> 16)
13244 && val8 == (val16 >> 8))
13245 {
13246 if (info)
13247 *info = simd_immediate_info (QImode, val8);
13248 return true;
13249 }
13250 }
13251
13252 /* Try using a bit-to-bytemask. */
13253 if (which == AARCH64_CHECK_MOV)
13254 {
13255 unsigned int i;
13256 for (i = 0; i < 64; i += 8)
13257 {
13258 unsigned char byte = (val64 >> i) & 0xff;
13259 if (byte != 0 && byte != 0xff)
13260 break;
13261 }
13262 if (i == 64)
13263 {
13264 if (info)
13265 *info = simd_immediate_info (DImode, val64);
13266 return true;
13267 }
13268 }
13269 return false;
13270 }
13271
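/* Editor's note: worked examples for the checks above, not part of the
   original source.  The value 0x0000120000001200 repeats its low 32 bits
   and 0x1200 is 0x12 << 8, so it is accepted as a MOVI with an SImode
   element and LSL #8.  The value 0x00ffff0000ffff00 matches none of the
   shifted-byte forms but every byte is 0x00 or 0xff, so it is accepted
   through the bit-to-bytemask test as a 64-bit immediate.  A value such
   as 0x0000123400001234 is rejected: the repeating element 0x1234 has
   two non-zero bytes, so no MOVI/MVNI form can encode it.  */
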
13272 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13273 instruction. If INFO is nonnull, use it to describe valid immediates. */
13274
13275 static bool
13276 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13277 simd_immediate_info *info)
13278 {
13279 scalar_int_mode mode = DImode;
13280 unsigned int val32 = val64 & 0xffffffff;
13281 if (val32 == (val64 >> 32))
13282 {
13283 mode = SImode;
13284 unsigned int val16 = val32 & 0xffff;
13285 if (val16 == (val32 >> 16))
13286 {
13287 mode = HImode;
13288 unsigned int val8 = val16 & 0xff;
13289 if (val8 == (val16 >> 8))
13290 mode = QImode;
13291 }
13292 }
13293 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13294 if (IN_RANGE (val, -0x80, 0x7f))
13295 {
13296 /* DUP with no shift. */
13297 if (info)
13298 *info = simd_immediate_info (mode, val);
13299 return true;
13300 }
13301 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13302 {
13303 /* DUP with LSL #8. */
13304 if (info)
13305 *info = simd_immediate_info (mode, val);
13306 return true;
13307 }
13308 if (aarch64_bitmask_imm (val64, mode))
13309 {
13310 /* DUPM. */
13311 if (info)
13312 *info = simd_immediate_info (mode, val);
13313 return true;
13314 }
13315 return false;
13316 }
13317
13318 /* Return true if OP is a valid SIMD immediate for the operation
13319 described by WHICH. If INFO is nonnull, use it to describe valid
13320 immediates. */
13321 bool
13322 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13323 enum simd_immediate_check which)
13324 {
13325 machine_mode mode = GET_MODE (op);
13326 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13327 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13328 return false;
13329
13330 scalar_mode elt_mode = GET_MODE_INNER (mode);
13331 rtx base, step;
13332 unsigned int n_elts;
13333 if (GET_CODE (op) == CONST_VECTOR
13334 && CONST_VECTOR_DUPLICATE_P (op))
13335 n_elts = CONST_VECTOR_NPATTERNS (op);
13336 else if ((vec_flags & VEC_SVE_DATA)
13337 && const_vec_series_p (op, &base, &step))
13338 {
13339 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13340 if (!aarch64_sve_index_immediate_p (base)
13341 || !aarch64_sve_index_immediate_p (step))
13342 return false;
13343
13344 if (info)
13345 *info = simd_immediate_info (elt_mode, base, step);
13346 return true;
13347 }
13348 else if (GET_CODE (op) == CONST_VECTOR
13349 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13350 /* N_ELTS set above. */;
13351 else
13352 return false;
13353
13354 /* Handle PFALSE and PTRUE. */
13355 if (vec_flags & VEC_SVE_PRED)
13356 return (op == CONST0_RTX (mode)
13357 || op == CONSTM1_RTX (mode));
13358
13359 scalar_float_mode elt_float_mode;
13360 if (n_elts == 1
13361 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13362 {
13363 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13364 if (aarch64_float_const_zero_rtx_p (elt)
13365 || aarch64_float_const_representable_p (elt))
13366 {
13367 if (info)
13368 *info = simd_immediate_info (elt_float_mode, elt);
13369 return true;
13370 }
13371 }
13372
13373 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13374 if (elt_size > 8)
13375 return false;
13376
13377 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13378
13379 /* Expand the vector constant out into a byte vector, with the least
13380 significant byte of the register first. */
13381 auto_vec<unsigned char, 16> bytes;
13382 bytes.reserve (n_elts * elt_size);
13383 for (unsigned int i = 0; i < n_elts; i++)
13384 {
13385 /* The vector is provided in gcc endian-neutral fashion.
13386 For aarch64_be Advanced SIMD, it must be laid out in the vector
13387 register in reverse order. */
13388 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13389 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13390
13391 if (elt_mode != elt_int_mode)
13392 elt = gen_lowpart (elt_int_mode, elt);
13393
13394 if (!CONST_INT_P (elt))
13395 return false;
13396
13397 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13398 for (unsigned int byte = 0; byte < elt_size; byte++)
13399 {
13400 bytes.quick_push (elt_val & 0xff);
13401 elt_val >>= BITS_PER_UNIT;
13402 }
13403 }
13404
13405 /* The immediate must repeat every eight bytes. */
13406 unsigned int nbytes = bytes.length ();
13407 for (unsigned i = 8; i < nbytes; ++i)
13408 if (bytes[i] != bytes[i - 8])
13409 return false;
13410
13411 /* Get the repeating 8-byte value as an integer. No endian correction
13412 is needed here because bytes is already in lsb-first order. */
13413 unsigned HOST_WIDE_INT val64 = 0;
13414 for (unsigned int i = 0; i < 8; i++)
13415 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13416 << (i * BITS_PER_UNIT));
13417
13418 if (vec_flags & VEC_SVE_DATA)
13419 return aarch64_sve_valid_immediate (val64, info);
13420 else
13421 return aarch64_advsimd_valid_immediate (val64, info, which);
13422 }
13423
13424 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13425 has a step in the range accepted by the SVE INDEX instruction.
13426 Return the step if so, otherwise return null. */
13427 rtx
13428 aarch64_check_zero_based_sve_index_immediate (rtx x)
13429 {
13430 rtx base, step;
13431 if (const_vec_series_p (x, &base, &step)
13432 && base == const0_rtx
13433 && aarch64_sve_index_immediate_p (step))
13434 return step;
13435 return NULL_RTX;
13436 }
13437
13438 /* Check if immediate shift constants are within range. */
13439 bool
13440 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13441 {
13442 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13443 if (left)
13444 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13445 else
13446 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13447 }
13448
13449 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13450 operation of width WIDTH at bit position POS. */
13451
13452 rtx
13453 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13454 {
13455 gcc_assert (CONST_INT_P (width));
13456 gcc_assert (CONST_INT_P (pos));
13457
13458 unsigned HOST_WIDE_INT mask
13459 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13460 return GEN_INT (mask << UINTVAL (pos));
13461 }
13462
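/* Editor's note: a worked example for the function above, not part of
   the original source.  For WIDTH = 8 and POS = 16 the mask is
   ((1 << 8) - 1) << 16 = 0x00ff0000, i.e. it selects bits 16-23.  */
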
13463 bool
13464 aarch64_mov_operand_p (rtx x, machine_mode mode)
13465 {
13466 if (GET_CODE (x) == HIGH
13467 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13468 return true;
13469
13470 if (CONST_INT_P (x))
13471 return true;
13472
13473 if (VECTOR_MODE_P (GET_MODE (x)))
13474 return aarch64_simd_valid_immediate (x, NULL);
13475
13476 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13477 return true;
13478
13479 if (aarch64_sve_cnt_immediate_p (x))
13480 return true;
13481
13482 return aarch64_classify_symbolic_expression (x)
13483 == SYMBOL_TINY_ABSOLUTE;
13484 }
13485
13486 /* Return a const_int vector of VAL. */
13487 rtx
13488 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13489 {
13490 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13491 return gen_const_vec_duplicate (mode, c);
13492 }
13493
13494 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13495
13496 bool
13497 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13498 {
13499 machine_mode vmode;
13500
13501 vmode = aarch64_simd_container_mode (mode, 64);
13502 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13503 return aarch64_simd_valid_immediate (op_v, NULL);
13504 }
13505
13506 /* Construct and return a PARALLEL RTX vector with elements numbering the
13507 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13508 the vector - from the perspective of the architecture. This does not
13509 line up with GCC's perspective on lane numbers, so we end up with
13510 different masks depending on our target endian-ness. The diagram
13511 below may help. We must draw the distinction when building masks
13512 which select one half of the vector. An instruction selecting
13513 architectural low-lanes for a big-endian target must be described using
13514 a mask selecting GCC high-lanes.
13515
13516 Big-Endian Little-Endian
13517
13518 GCC 0 1 2 3 3 2 1 0
13519 | x | x | x | x | | x | x | x | x |
13520 Architecture 3 2 1 0 3 2 1 0
13521
13522 Low Mask: { 2, 3 } { 0, 1 }
13523 High Mask: { 0, 1 } { 2, 3 }
13524
13525 MODE is the mode of the vector and NUNITS is the number of units in it. */
13526
13527 rtx
13528 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13529 {
13530 rtvec v = rtvec_alloc (nunits / 2);
13531 int high_base = nunits / 2;
13532 int low_base = 0;
13533 int base;
13534 rtx t1;
13535 int i;
13536
13537 if (BYTES_BIG_ENDIAN)
13538 base = high ? low_base : high_base;
13539 else
13540 base = high ? high_base : low_base;
13541
13542 for (i = 0; i < nunits / 2; i++)
13543 RTVEC_ELT (v, i) = GEN_INT (base + i);
13544
13545 t1 = gen_rtx_PARALLEL (mode, v);
13546 return t1;
13547 }
13548
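/* Editor's note: a worked example for the function above, not part of
   the original source.  For V4SImode with NUNITS == 4 and HIGH == true,
   the result is PARALLEL [2, 3] on little-endian but PARALLEL [0, 1] on
   big-endian, matching the diagram above: the architectural high half
   corresponds to GCC lanes 0 and 1 when BYTES_BIG_ENDIAN.  */
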
13549 /* Check OP for validity as a PARALLEL RTX vector with elements
13550 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13551 from the perspective of the architecture. See the diagram above
13552 aarch64_simd_vect_par_cnst_half for more details. */
13553
13554 bool
13555 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13556 bool high)
13557 {
13558 int nelts;
13559 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13560 return false;
13561
13562 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13563 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13564 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13565 int i = 0;
13566
13567 if (count_op != count_ideal)
13568 return false;
13569
13570 for (i = 0; i < count_ideal; i++)
13571 {
13572 rtx elt_op = XVECEXP (op, 0, i);
13573 rtx elt_ideal = XVECEXP (ideal, 0, i);
13574
13575 if (!CONST_INT_P (elt_op)
13576 || INTVAL (elt_ideal) != INTVAL (elt_op))
13577 return false;
13578 }
13579 return true;
13580 }
13581
13582 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13583 HIGH (exclusive). */
13584 void
13585 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13586 const_tree exp)
13587 {
13588 HOST_WIDE_INT lane;
13589 gcc_assert (CONST_INT_P (operand));
13590 lane = INTVAL (operand);
13591
13592 if (lane < low || lane >= high)
13593 {
13594 if (exp)
13595 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13596 else
13597 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13598 }
13599 }
13600
13601 /* Perform endian correction on lane number N, which indexes a vector
13602 of mode MODE, and return the result as an SImode rtx. */
13603
13604 rtx
13605 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13606 {
13607 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13608 }
13609
13610 /* Return TRUE if OP is a valid vector addressing mode. */
13611
13612 bool
13613 aarch64_simd_mem_operand_p (rtx op)
13614 {
13615 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13616 || REG_P (XEXP (op, 0)));
13617 }
13618
13619 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13620
13621 bool
13622 aarch64_sve_ld1r_operand_p (rtx op)
13623 {
13624 struct aarch64_address_info addr;
13625 scalar_mode mode;
13626
13627 return (MEM_P (op)
13628 && is_a <scalar_mode> (GET_MODE (op), &mode)
13629 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13630 && addr.type == ADDRESS_REG_IMM
13631 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13632 }
13633
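/* Editor's note: a worked example for the predicate above, not part of
   the original source.  For a 4-byte element the 6-bit unsigned scaled
   offset allows immediates 0, 4, ..., 252, so an address like
   [x0, #252] is accepted for LD1RW while [x0, #256] and [x0, #2]
   are not.  */
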
13634 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13635 The conditions for STR are the same. */
13636 bool
13637 aarch64_sve_ldr_operand_p (rtx op)
13638 {
13639 struct aarch64_address_info addr;
13640
13641 return (MEM_P (op)
13642 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13643 false, ADDR_QUERY_ANY)
13644 && addr.type == ADDRESS_REG_IMM);
13645 }
13646
13647 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13648 We need to be able to access the individual pieces, so the range
13649 is different from LD[234] and ST[234]. */
13650 bool
13651 aarch64_sve_struct_memory_operand_p (rtx op)
13652 {
13653 if (!MEM_P (op))
13654 return false;
13655
13656 machine_mode mode = GET_MODE (op);
13657 struct aarch64_address_info addr;
13658 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13659 ADDR_QUERY_ANY)
13660 || addr.type != ADDRESS_REG_IMM)
13661 return false;
13662
13663 poly_int64 first = addr.const_offset;
13664 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13665 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13666 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13667 }
13668
13669 /* Emit a register copy from operand to operand, taking care not to
13670 early-clobber source registers in the process.
13671
13672 COUNT is the number of components into which the copy needs to be
13673 decomposed. */
13674 void
13675 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13676 unsigned int count)
13677 {
13678 unsigned int i;
13679 int rdest = REGNO (operands[0]);
13680 int rsrc = REGNO (operands[1]);
13681
13682 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13683 || rdest < rsrc)
13684 for (i = 0; i < count; i++)
13685 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13686 gen_rtx_REG (mode, rsrc + i));
13687 else
13688 for (i = 0; i < count; i++)
13689 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13690 gen_rtx_REG (mode, rsrc + count - i - 1));
13691 }
13692
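/* Editor's note: a worked example for the function above, not part of
   the original source.  Copying a two-register value from V1-V2 to
   V2-V3 overlaps, and the destination starts above the source, so the
   moves are emitted in reverse order (V3 <- V2, then V2 <- V1) to avoid
   clobbering V2 before it has been read.  */
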
13693 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13694 one of the VSTRUCT modes: OI, CI, or XI. */
13695 int
13696 aarch64_simd_attr_length_rglist (machine_mode mode)
13697 {
13698 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13699 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13700 }
13701
13702 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13703 alignment of a vector to 128 bits. SVE predicates have an alignment of
13704 16 bits. */
13705 static HOST_WIDE_INT
13706 aarch64_simd_vector_alignment (const_tree type)
13707 {
13708 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13709 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13710 be set for non-predicate vectors of booleans. Modes are the most
13711 direct way we have of identifying real SVE predicate types. */
13712 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13713 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13714 return MIN (align, 128);
13715 }
13716
13717 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13718 static HOST_WIDE_INT
13719 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13720 {
13721 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13722 {
13723 /* If the length of the vector is fixed, try to align to that length,
13724 otherwise don't try to align at all. */
13725 HOST_WIDE_INT result;
13726 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13727 result = TYPE_ALIGN (TREE_TYPE (type));
13728 return result;
13729 }
13730 return TYPE_ALIGN (type);
13731 }
13732
13733 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13734 static bool
13735 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13736 {
13737 if (is_packed)
13738 return false;
13739
13740 /* For fixed-length vectors, check that the vectorizer will aim for
13741 full-vector alignment. This isn't true for generic GCC vectors
13742 that are wider than the ABI maximum of 128 bits. */
13743 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13744 && (wi::to_widest (TYPE_SIZE (type))
13745 != aarch64_vectorize_preferred_vector_alignment (type)))
13746 return false;
13747
13748 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13749 return true;
13750 }
13751
13752 /* Return true if the vector misalignment factor is supported by the
13753 target. */
13754 static bool
13755 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13756 const_tree type, int misalignment,
13757 bool is_packed)
13758 {
13759 if (TARGET_SIMD && STRICT_ALIGNMENT)
13760 {
13761 /* Return if movmisalign pattern is not supported for this mode. */
13762 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13763 return false;
13764
13765 /* Misalignment factor is unknown at compile time. */
13766 if (misalignment == -1)
13767 return false;
13768 }
13769 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13770 is_packed);
13771 }
13772
13773 /* If VALS is a vector constant that can be loaded into a register
13774 using DUP, generate instructions to do so and return an RTX to
13775 assign to the register. Otherwise return NULL_RTX. */
13776 static rtx
13777 aarch64_simd_dup_constant (rtx vals)
13778 {
13779 machine_mode mode = GET_MODE (vals);
13780 machine_mode inner_mode = GET_MODE_INNER (mode);
13781 rtx x;
13782
13783 if (!const_vec_duplicate_p (vals, &x))
13784 return NULL_RTX;
13785
13786 /* We can load this constant by using DUP and a constant in a
13787 single ARM register. This will be cheaper than a vector
13788 load. */
13789 x = copy_to_mode_reg (inner_mode, x);
13790 return gen_vec_duplicate (mode, x);
13791 }
13792
13793
13794 /* Generate code to load VALS, which is a PARALLEL containing only
13795 constants (for vec_init) or CONST_VECTOR, efficiently into a
13796 register. Returns an RTX to copy into the register, or NULL_RTX
13797 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13798 static rtx
13799 aarch64_simd_make_constant (rtx vals)
13800 {
13801 machine_mode mode = GET_MODE (vals);
13802 rtx const_dup;
13803 rtx const_vec = NULL_RTX;
13804 int n_const = 0;
13805 int i;
13806
13807 if (GET_CODE (vals) == CONST_VECTOR)
13808 const_vec = vals;
13809 else if (GET_CODE (vals) == PARALLEL)
13810 {
13811 /* A CONST_VECTOR must contain only CONST_INTs and
13812 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13813 Only store valid constants in a CONST_VECTOR. */
13814 int n_elts = XVECLEN (vals, 0);
13815 for (i = 0; i < n_elts; ++i)
13816 {
13817 rtx x = XVECEXP (vals, 0, i);
13818 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13819 n_const++;
13820 }
13821 if (n_const == n_elts)
13822 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13823 }
13824 else
13825 gcc_unreachable ();
13826
13827 if (const_vec != NULL_RTX
13828 && aarch64_simd_valid_immediate (const_vec, NULL))
13829 /* Load using MOVI/MVNI. */
13830 return const_vec;
13831 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13832 /* Loaded using DUP. */
13833 return const_dup;
13834 else if (const_vec != NULL_RTX)
13835 /* Load from constant pool. We can not take advantage of single-cycle
13836 LD1 because we need a PC-relative addressing mode. */
13837 return const_vec;
13838 else
13839 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13840 We can not construct an initializer. */
13841 return NULL_RTX;
13842 }
13843
13844 /* Expand a vector initialisation sequence, such that TARGET is
13845 initialised to contain VALS. */
13846
13847 void
13848 aarch64_expand_vector_init (rtx target, rtx vals)
13849 {
13850 machine_mode mode = GET_MODE (target);
13851 scalar_mode inner_mode = GET_MODE_INNER (mode);
13852 /* The number of vector elements. */
13853 int n_elts = XVECLEN (vals, 0);
13854 /* The number of vector elements which are not constant. */
13855 int n_var = 0;
13856 rtx any_const = NULL_RTX;
13857 /* The first element of vals. */
13858 rtx v0 = XVECEXP (vals, 0, 0);
13859 bool all_same = true;
13860
13861 /* Count the number of variable elements to initialise. */
13862 for (int i = 0; i < n_elts; ++i)
13863 {
13864 rtx x = XVECEXP (vals, 0, i);
13865 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13866 ++n_var;
13867 else
13868 any_const = x;
13869
13870 all_same &= rtx_equal_p (x, v0);
13871 }
13872
13873 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13874 how best to handle this. */
13875 if (n_var == 0)
13876 {
13877 rtx constant = aarch64_simd_make_constant (vals);
13878 if (constant != NULL_RTX)
13879 {
13880 emit_move_insn (target, constant);
13881 return;
13882 }
13883 }
13884
13885 /* Splat a single non-constant element if we can. */
13886 if (all_same)
13887 {
13888 rtx x = copy_to_mode_reg (inner_mode, v0);
13889 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13890 return;
13891 }
13892
13893 enum insn_code icode = optab_handler (vec_set_optab, mode);
13894 gcc_assert (icode != CODE_FOR_nothing);
13895
13896 /* If there are only variable elements, try to optimize
13897 the insertion using dup for the most common element
13898 followed by insertions. */
13899
13900 /* The algorithm will fill matches[*][0] with the earliest matching element,
13901 and matches[X][1] with the count of duplicate elements (if X is the
13902 earliest element which has duplicates). */
13903
13904 if (n_var == n_elts && n_elts <= 16)
13905 {
13906 int matches[16][2] = {0};
13907 for (int i = 0; i < n_elts; i++)
13908 {
13909 for (int j = 0; j <= i; j++)
13910 {
13911 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13912 {
13913 matches[i][0] = j;
13914 matches[j][1]++;
13915 break;
13916 }
13917 }
13918 }
13919 int maxelement = 0;
13920 int maxv = 0;
13921 for (int i = 0; i < n_elts; i++)
13922 if (matches[i][1] > maxv)
13923 {
13924 maxelement = i;
13925 maxv = matches[i][1];
13926 }
13927
13928 /* Create a duplicate of the most common element, unless all elements
13929 are equally useless to us, in which case just immediately set the
13930 vector register using the first element. */
13931
13932 if (maxv == 1)
13933 {
13934 /* For vectors of two 64-bit elements, we can do even better. */
13935 if (n_elts == 2
13936 && (inner_mode == E_DImode
13937 || inner_mode == E_DFmode))
13938
13939 {
13940 rtx x0 = XVECEXP (vals, 0, 0);
13941 rtx x1 = XVECEXP (vals, 0, 1);
13942 /* Combine can pick up this case, but handling it directly
13943 here leaves clearer RTL.
13944
13945 This is load_pair_lanes<mode>, and also gives us a clean-up
13946 for store_pair_lanes<mode>. */
13947 if (memory_operand (x0, inner_mode)
13948 && memory_operand (x1, inner_mode)
13949 && !STRICT_ALIGNMENT
13950 && rtx_equal_p (XEXP (x1, 0),
13951 plus_constant (Pmode,
13952 XEXP (x0, 0),
13953 GET_MODE_SIZE (inner_mode))))
13954 {
13955 rtx t;
13956 if (inner_mode == DFmode)
13957 t = gen_load_pair_lanesdf (target, x0, x1);
13958 else
13959 t = gen_load_pair_lanesdi (target, x0, x1);
13960 emit_insn (t);
13961 return;
13962 }
13963 }
13964 /* The subreg-move sequence below will move into lane zero of the
13965 vector register. For big-endian we want that position to hold
13966 the last element of VALS. */
13967 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
13968 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13969 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
13970 }
13971 else
13972 {
13973 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13974 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13975 }
13976
13977 /* Insert the rest. */
13978 for (int i = 0; i < n_elts; i++)
13979 {
13980 rtx x = XVECEXP (vals, 0, i);
13981 if (matches[i][0] == maxelement)
13982 continue;
13983 x = copy_to_mode_reg (inner_mode, x);
13984 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13985 }
13986 return;
13987 }
13988
13989 /* Initialise a vector which is part-variable. We want to first try
13990 to build those lanes which are constant in the most efficient way we
13991 can. */
13992 if (n_var != n_elts)
13993 {
13994 rtx copy = copy_rtx (vals);
13995
13996 /* Load constant part of vector. We really don't care what goes into the
13997 parts we will overwrite, but we're more likely to be able to load the
13998 constant efficiently if it has fewer, larger, repeating parts
13999 (see aarch64_simd_valid_immediate). */
14000 for (int i = 0; i < n_elts; i++)
14001 {
14002 rtx x = XVECEXP (vals, 0, i);
14003 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14004 continue;
14005 rtx subst = any_const;
14006 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14007 {
14008 /* Look in the copied vector, as more elements are const. */
14009 rtx test = XVECEXP (copy, 0, i ^ bit);
14010 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14011 {
14012 subst = test;
14013 break;
14014 }
14015 }
14016 XVECEXP (copy, 0, i) = subst;
14017 }
14018 aarch64_expand_vector_init (target, copy);
14019 }
14020
14021 /* Insert the variable lanes directly. */
14022 for (int i = 0; i < n_elts; i++)
14023 {
14024 rtx x = XVECEXP (vals, 0, i);
14025 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14026 continue;
14027 x = copy_to_mode_reg (inner_mode, x);
14028 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14029 }
14030 }
14031
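/* Editor's note: a worked example for aarch64_expand_vector_init above,
   not part of the original source.  For an all-variable V4SI initialiser
   { a, b, a, a }, element a is the most common (count 3), so the code
   emits a DUP of a into the vector register followed by a single lane
   insert of b at index 1, rather than four separate inserts.  */
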
14032 static unsigned HOST_WIDE_INT
14033 aarch64_shift_truncation_mask (machine_mode mode)
14034 {
14035 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14036 return 0;
14037 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14038 }
14039
14040 /* Select a format to encode pointers in exception handling data. */
14041 int
14042 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14043 {
14044 int type;
14045 switch (aarch64_cmodel)
14046 {
14047 case AARCH64_CMODEL_TINY:
14048 case AARCH64_CMODEL_TINY_PIC:
14049 case AARCH64_CMODEL_SMALL:
14050 case AARCH64_CMODEL_SMALL_PIC:
14051 case AARCH64_CMODEL_SMALL_SPIC:
14052 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14053 for everything. */
14054 type = DW_EH_PE_sdata4;
14055 break;
14056 default:
14057 /* No assumptions here. 8-byte relocs required. */
14058 type = DW_EH_PE_sdata8;
14059 break;
14060 }
14061 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14062 }
14063
14064 /* The last .arch and .tune assembly strings that we printed. */
14065 static std::string aarch64_last_printed_arch_string;
14066 static std::string aarch64_last_printed_tune_string;
14067
14068 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14069 by the function fndecl. */
14070
14071 void
14072 aarch64_declare_function_name (FILE *stream, const char* name,
14073 tree fndecl)
14074 {
14075 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14076
14077 struct cl_target_option *targ_options;
14078 if (target_parts)
14079 targ_options = TREE_TARGET_OPTION (target_parts);
14080 else
14081 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14082 gcc_assert (targ_options);
14083
14084 const struct processor *this_arch
14085 = aarch64_get_arch (targ_options->x_explicit_arch);
14086
14087 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14088 std::string extension
14089 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14090 this_arch->flags);
14091 /* Only update the assembler .arch string if it is distinct from the last
14092 such string we printed. */
14093 std::string to_print = this_arch->name + extension;
14094 if (to_print != aarch64_last_printed_arch_string)
14095 {
14096 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14097 aarch64_last_printed_arch_string = to_print;
14098 }
14099
14100 /* Print the cpu name we're tuning for in the comments; this might be
14101 useful to readers of the generated asm. Do it only when it changes
14102 from function to function and verbose assembly is requested. */
14103 const struct processor *this_tune
14104 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14105
14106 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14107 {
14108 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14109 this_tune->name);
14110 aarch64_last_printed_tune_string = this_tune->name;
14111 }
14112
14113 /* Don't forget the type directive for ELF. */
14114 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14115 ASM_OUTPUT_LABEL (stream, name);
14116 }
14117
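/* Editor's note: an illustrative sketch of the output of the hook above;
   it is not part of the original source.  For a function foo the emitted
   assembly looks roughly like

       .arch armv8-a+crc
       .type foo, %function
   foo:

   and the .arch directive is re-emitted only when it differs from the
   last one printed.  */
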
14118 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14119
14120 static void
14121 aarch64_start_file (void)
14122 {
14123 struct cl_target_option *default_options
14124 = TREE_TARGET_OPTION (target_option_default_node);
14125
14126 const struct processor *default_arch
14127 = aarch64_get_arch (default_options->x_explicit_arch);
14128 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14129 std::string extension
14130 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14131 default_arch->flags);
14132
14133 aarch64_last_printed_arch_string = default_arch->name + extension;
14134 aarch64_last_printed_tune_string = "";
14135 asm_fprintf (asm_out_file, "\t.arch %s\n",
14136 aarch64_last_printed_arch_string.c_str ());
14137
14138 default_file_start ();
14139 }
14140
14141 /* Emit load exclusive. */
14142
14143 static void
14144 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14145 rtx mem, rtx model_rtx)
14146 {
14147 rtx (*gen) (rtx, rtx, rtx);
14148
14149 switch (mode)
14150 {
14151 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14152 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14153 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14154 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14155 default:
14156 gcc_unreachable ();
14157 }
14158
14159 emit_insn (gen (rval, mem, model_rtx));
14160 }
14161
14162 /* Emit store exclusive. */
14163
14164 static void
14165 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14166 rtx rval, rtx mem, rtx model_rtx)
14167 {
14168 rtx (*gen) (rtx, rtx, rtx, rtx);
14169
14170 switch (mode)
14171 {
14172 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14173 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14174 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14175 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14176 default:
14177 gcc_unreachable ();
14178 }
14179
14180 emit_insn (gen (bval, rval, mem, model_rtx));
14181 }
14182
14183 /* Emit INSN as a jump instruction and mark it as unlikely to be taken. */
14184
14185 static void
14186 aarch64_emit_unlikely_jump (rtx insn)
14187 {
14188 rtx_insn *jump = emit_jump_insn (insn);
14189 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14190 }
14191
14192 /* Expand a compare and swap pattern. */
14193
14194 void
14195 aarch64_expand_compare_and_swap (rtx operands[])
14196 {
14197 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14198 machine_mode mode, cmp_mode;
14199 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14200 int idx;
14201 gen_cas_fn gen;
14202 const gen_cas_fn split_cas[] =
14203 {
14204 gen_aarch64_compare_and_swapqi,
14205 gen_aarch64_compare_and_swaphi,
14206 gen_aarch64_compare_and_swapsi,
14207 gen_aarch64_compare_and_swapdi
14208 };
14209 const gen_cas_fn atomic_cas[] =
14210 {
14211 gen_aarch64_compare_and_swapqi_lse,
14212 gen_aarch64_compare_and_swaphi_lse,
14213 gen_aarch64_compare_and_swapsi_lse,
14214 gen_aarch64_compare_and_swapdi_lse
14215 };
14216
14217 bval = operands[0];
14218 rval = operands[1];
14219 mem = operands[2];
14220 oldval = operands[3];
14221 newval = operands[4];
14222 is_weak = operands[5];
14223 mod_s = operands[6];
14224 mod_f = operands[7];
14225 mode = GET_MODE (mem);
14226 cmp_mode = mode;
14227
14228 /* Normally the succ memory model must be stronger than fail, but in the
14229 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14230 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14231
14232 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14233 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14234 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14235
14236 switch (mode)
14237 {
14238 case E_QImode:
14239 case E_HImode:
14240 /* For short modes, we're going to perform the comparison in SImode,
14241 so do the zero-extension now. */
14242 cmp_mode = SImode;
14243 rval = gen_reg_rtx (SImode);
14244 oldval = convert_modes (SImode, mode, oldval, true);
14245 /* Fall through. */
14246
14247 case E_SImode:
14248 case E_DImode:
14249 /* Force the value into a register if needed. */
14250 if (!aarch64_plus_operand (oldval, mode))
14251 oldval = force_reg (cmp_mode, oldval);
14252 break;
14253
14254 default:
14255 gcc_unreachable ();
14256 }
14257
14258 switch (mode)
14259 {
14260 case E_QImode: idx = 0; break;
14261 case E_HImode: idx = 1; break;
14262 case E_SImode: idx = 2; break;
14263 case E_DImode: idx = 3; break;
14264 default:
14265 gcc_unreachable ();
14266 }
14267 if (TARGET_LSE)
14268 gen = atomic_cas[idx];
14269 else
14270 gen = split_cas[idx];
14271
14272 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14273
14274 if (mode == QImode || mode == HImode)
14275 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14276
14277 x = gen_rtx_REG (CCmode, CC_REGNUM);
14278 x = gen_rtx_EQ (SImode, x, const0_rtx);
14279 emit_insn (gen_rtx_SET (bval, x));
14280 }
14281
14282 /* Test whether the target supports using an atomic load-operate instruction.
14283 CODE is the operation and AFTER is TRUE if the data in memory after the
14284 operation should be returned and FALSE if the data before the operation
14285 should be returned. Returns FALSE if the operation isn't supported by the
14286 architecture. */
14287
14288 bool
14289 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14290 {
14291 if (!TARGET_LSE)
14292 return false;
14293
14294 switch (code)
14295 {
14296 case SET:
14297 case AND:
14298 case IOR:
14299 case XOR:
14300 case MINUS:
14301 case PLUS:
14302 return true;
14303 default:
14304 return false;
14305 }
14306 }
14307
14308 /* Emit a barrier appropriate for memory model MODEL at the end of a
14309 sequence implementing an atomic operation. */
14310
14311 static void
14312 aarch64_emit_post_barrier (enum memmodel model)
14313 {
14314 const enum memmodel base_model = memmodel_base (model);
14315
14316 if (is_mm_sync (model)
14317 && (base_model == MEMMODEL_ACQUIRE
14318 || base_model == MEMMODEL_ACQ_REL
14319 || base_model == MEMMODEL_SEQ_CST))
14320 {
14321 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14322 }
14323 }
14324
14325 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14326 for the data in memory. EXPECTED is the value expected to be in memory.
14327 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14328 is the memory ordering to use. */
14329
14330 void
14331 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14332 rtx expected, rtx desired,
14333 rtx model)
14334 {
14335 rtx (*gen) (rtx, rtx, rtx, rtx);
14336 machine_mode mode;
14337
14338 mode = GET_MODE (mem);
14339
14340 switch (mode)
14341 {
14342 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14343 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14344 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14345 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14346 default:
14347 gcc_unreachable ();
14348 }
14349
14350 /* Move the expected value into the CAS destination register. */
14351 emit_insn (gen_rtx_SET (rval, expected));
14352
14353 /* Emit the CAS. */
14354 emit_insn (gen (rval, mem, desired, model));
14355
14356 /* Compare the expected value with the value loaded by the CAS, to establish
14357 whether the swap was made. */
14358 aarch64_gen_compare_reg (EQ, rval, expected);
14359 }
14360
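/* Editor's note: an illustrative sketch of the sequence generated by the
   function above; it is not part of the original source.  For a
   word-sized CAS the output is roughly

       mov   w3, w1          // move EXPECTED into the CAS destination
       casal w3, w2, [x0]    // LSE compare-and-swap on MEM
       cmp   w3, w1          // equal iff the swap was made

   where the acquire/release suffix on the CAS depends on MODEL.  */
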
14361 /* Split a compare and swap pattern. */
14362
14363 void
14364 aarch64_split_compare_and_swap (rtx operands[])
14365 {
14366 rtx rval, mem, oldval, newval, scratch;
14367 machine_mode mode;
14368 bool is_weak;
14369 rtx_code_label *label1, *label2;
14370 rtx x, cond;
14371 enum memmodel model;
14372 rtx model_rtx;
14373
14374 rval = operands[0];
14375 mem = operands[1];
14376 oldval = operands[2];
14377 newval = operands[3];
14378 is_weak = (operands[4] != const0_rtx);
14379 model_rtx = operands[5];
14380 scratch = operands[7];
14381 mode = GET_MODE (mem);
14382 model = memmodel_from_int (INTVAL (model_rtx));
14383
14384 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14385 loop:
14386 .label1:
14387 LD[A]XR rval, [mem]
14388 CBNZ rval, .label2
14389 ST[L]XR scratch, newval, [mem]
14390 CBNZ scratch, .label1
14391 .label2:
14392 CMP rval, 0. */
14393 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14394
14395 label1 = NULL;
14396 if (!is_weak)
14397 {
14398 label1 = gen_label_rtx ();
14399 emit_label (label1);
14400 }
14401 label2 = gen_label_rtx ();
14402
14403 /* The initial load can be relaxed for a __sync operation since a final
14404 barrier will be emitted to stop code hoisting. */
14405 if (is_mm_sync (model))
14406 aarch64_emit_load_exclusive (mode, rval, mem,
14407 GEN_INT (MEMMODEL_RELAXED));
14408 else
14409 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14410
14411 if (strong_zero_p)
14412 {
14413 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14414 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14415 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14416 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14417 }
14418 else
14419 {
14420 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14421 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14422 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14423 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14424 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14425 }
14426
14427 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14428
14429 if (!is_weak)
14430 {
14431 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14432 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14433 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14434 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14435 }
14436 else
14437 {
14438 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14439 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14440 emit_insn (gen_rtx_SET (cond, x));
14441 }
14442
14443 emit_label (label2);
14444 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14445 to set the condition flags. If this is not used it will be removed by
14446 later passes. */
14447 if (strong_zero_p)
14448 {
14449 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14450 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14451 emit_insn (gen_rtx_SET (cond, x));
14452 }
14453 /* Emit any final barrier needed for a __sync operation. */
14454 if (is_mm_sync (model))
14455 aarch64_emit_post_barrier (model);
14456 }
14457
14458 /* Emit a BIC instruction. */
14459
14460 static void
14461 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14462 {
14463 rtx shift_rtx = GEN_INT (shift);
14464 rtx (*gen) (rtx, rtx, rtx, rtx);
14465
14466 switch (mode)
14467 {
14468 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14469 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14470 default:
14471 gcc_unreachable ();
14472 }
14473
14474 emit_insn (gen (dst, s2, shift_rtx, s1));
14475 }
14476
14477 /* Emit an atomic swap. */
14478
14479 static void
14480 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14481 rtx mem, rtx model)
14482 {
14483 rtx (*gen) (rtx, rtx, rtx, rtx);
14484
14485 switch (mode)
14486 {
14487 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14488 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14489 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14490 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14491 default:
14492 gcc_unreachable ();
14493 }
14494
14495 emit_insn (gen (dst, mem, value, model));
14496 }
14497
14498 /* Operations supported by aarch64_emit_atomic_load_op. */
14499
14500 enum aarch64_atomic_load_op_code
14501 {
14502 AARCH64_LDOP_PLUS, /* A + B */
14503 AARCH64_LDOP_XOR, /* A ^ B */
14504 AARCH64_LDOP_OR, /* A | B */
14505 AARCH64_LDOP_BIC /* A & ~B */
14506 };
14507
14508 /* Emit an atomic load-operate. */
14509
14510 static void
14511 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14512 machine_mode mode, rtx dst, rtx src,
14513 rtx mem, rtx model)
14514 {
14515 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14516 const aarch64_atomic_load_op_fn plus[] =
14517 {
14518 gen_aarch64_atomic_loadaddqi,
14519 gen_aarch64_atomic_loadaddhi,
14520 gen_aarch64_atomic_loadaddsi,
14521 gen_aarch64_atomic_loadadddi
14522 };
14523 const aarch64_atomic_load_op_fn eor[] =
14524 {
14525 gen_aarch64_atomic_loadeorqi,
14526 gen_aarch64_atomic_loadeorhi,
14527 gen_aarch64_atomic_loadeorsi,
14528 gen_aarch64_atomic_loadeordi
14529 };
14530 const aarch64_atomic_load_op_fn ior[] =
14531 {
14532 gen_aarch64_atomic_loadsetqi,
14533 gen_aarch64_atomic_loadsethi,
14534 gen_aarch64_atomic_loadsetsi,
14535 gen_aarch64_atomic_loadsetdi
14536 };
14537 const aarch64_atomic_load_op_fn bic[] =
14538 {
14539 gen_aarch64_atomic_loadclrqi,
14540 gen_aarch64_atomic_loadclrhi,
14541 gen_aarch64_atomic_loadclrsi,
14542 gen_aarch64_atomic_loadclrdi
14543 };
14544 aarch64_atomic_load_op_fn gen;
14545 int idx = 0;
14546
14547 switch (mode)
14548 {
14549 case E_QImode: idx = 0; break;
14550 case E_HImode: idx = 1; break;
14551 case E_SImode: idx = 2; break;
14552 case E_DImode: idx = 3; break;
14553 default:
14554 gcc_unreachable ();
14555 }
14556
14557 switch (code)
14558 {
14559 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14560 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14561 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14562 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14563 default:
14564 gcc_unreachable ();
14565 }
14566
14567 emit_insn (gen (dst, mem, src, model));
14568 }
14569
14570 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14571 location to store the data read from memory. OUT_RESULT is the location to
14572 store the result of the operation. MEM is the memory location to read and
14573 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14574 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14575 be NULL. */
14576
14577 void
14578 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14579 rtx mem, rtx value, rtx model_rtx)
14580 {
14581 machine_mode mode = GET_MODE (mem);
14582 machine_mode wmode = (mode == DImode ? DImode : SImode);
14583 const bool short_mode = (mode < SImode);
14584 aarch64_atomic_load_op_code ldop_code;
14585 rtx src;
14586 rtx x;
14587
14588 if (out_data)
14589 out_data = gen_lowpart (mode, out_data);
14590
14591 if (out_result)
14592 out_result = gen_lowpart (mode, out_result);
14593
14594 /* Make sure the value is in a register, putting it into a destination
14595 register if it needs to be manipulated. */
14596 if (!register_operand (value, mode)
14597 || code == AND || code == MINUS)
14598 {
14599 src = out_result ? out_result : out_data;
14600 emit_move_insn (src, gen_lowpart (mode, value));
14601 }
14602 else
14603 src = value;
14604 gcc_assert (register_operand (src, mode));
14605
14606 /* Preprocess the data for the operation as necessary. If the operation is
14607 a SET then emit a swap instruction and finish. */
14608 switch (code)
14609 {
14610 case SET:
14611 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14612 return;
14613
14614 case MINUS:
14615 /* Negate the value and treat it as a PLUS. */
14616 {
14617 rtx neg_src;
14618
14619 /* Resize the value if necessary. */
14620 if (short_mode)
14621 src = gen_lowpart (wmode, src);
14622
14623 neg_src = gen_rtx_NEG (wmode, src);
14624 emit_insn (gen_rtx_SET (src, neg_src));
14625
14626 if (short_mode)
14627 src = gen_lowpart (mode, src);
14628 }
14629 /* Fall-through. */
14630 case PLUS:
14631 ldop_code = AARCH64_LDOP_PLUS;
14632 break;
14633
14634 case IOR:
14635 ldop_code = AARCH64_LDOP_OR;
14636 break;
14637
14638 case XOR:
14639 ldop_code = AARCH64_LDOP_XOR;
14640 break;
14641
14642 case AND:
14643 {
14644 rtx not_src;
14645
14646 /* Resize the value if necessary. */
14647 if (short_mode)
14648 src = gen_lowpart (wmode, src);
14649
14650 not_src = gen_rtx_NOT (wmode, src);
14651 emit_insn (gen_rtx_SET (src, not_src));
14652
14653 if (short_mode)
14654 src = gen_lowpart (mode, src);
14655 }
14656 ldop_code = AARCH64_LDOP_BIC;
14657 break;
14658
14659 default:
14660 /* The operation can't be done with atomic instructions. */
14661 gcc_unreachable ();
14662 }
14663
14664 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14665
14666 /* If necessary, calculate the data in memory after the update by redoing the
14667 operation from values in registers. */
14668 if (!out_result)
14669 return;
14670
14671 if (short_mode)
14672 {
14673 src = gen_lowpart (wmode, src);
14674 out_data = gen_lowpart (wmode, out_data);
14675 out_result = gen_lowpart (wmode, out_result);
14676 }
14677
14678 x = NULL_RTX;
14679
14680 switch (code)
14681 {
14682 case MINUS:
14683 case PLUS:
14684 x = gen_rtx_PLUS (wmode, out_data, src);
14685 break;
14686 case IOR:
14687 x = gen_rtx_IOR (wmode, out_data, src);
14688 break;
14689 case XOR:
14690 x = gen_rtx_XOR (wmode, out_data, src);
14691 break;
14692 case AND:
14693 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14694 return;
14695 default:
14696 gcc_unreachable ();
14697 }
14698
14699 emit_set_insn (out_result, x);
14700
14701 return;
14702 }
14703
14704 /* Split an atomic operation. */
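/* For example (illustrative only -- the exact registers and the
   acquire/release variants depend on the memory model), a 32-bit
   fetch-and-add splits into a loop of roughly this shape:

       1:  ldxr  w0, [x1]
           add   w2, w0, w3
           stxr  w4, w2, [x1]
           cbnz  w4, 1b  */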
14705
14706 void
14707 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14708 rtx value, rtx model_rtx, rtx cond)
14709 {
14710 machine_mode mode = GET_MODE (mem);
14711 machine_mode wmode = (mode == DImode ? DImode : SImode);
14712 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14713 const bool is_sync = is_mm_sync (model);
14714 rtx_code_label *label;
14715 rtx x;
14716
14717 /* Split the atomic operation into a sequence. */
14718 label = gen_label_rtx ();
14719 emit_label (label);
14720
14721 if (new_out)
14722 new_out = gen_lowpart (wmode, new_out);
14723 if (old_out)
14724 old_out = gen_lowpart (wmode, old_out);
14725 else
14726 old_out = new_out;
14727 value = simplify_gen_subreg (wmode, value, mode, 0);
14728
14729 /* The initial load can be relaxed for a __sync operation since a final
14730 barrier will be emitted to stop code hoisting. */
14731 if (is_sync)
14732 aarch64_emit_load_exclusive (mode, old_out, mem,
14733 GEN_INT (MEMMODEL_RELAXED));
14734 else
14735 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14736
14737 switch (code)
14738 {
14739 case SET:
14740 new_out = value;
14741 break;
14742
14743 case NOT:
14744 x = gen_rtx_AND (wmode, old_out, value);
14745 emit_insn (gen_rtx_SET (new_out, x));
14746 x = gen_rtx_NOT (wmode, new_out);
14747 emit_insn (gen_rtx_SET (new_out, x));
14748 break;
14749
14750 case MINUS:
14751 if (CONST_INT_P (value))
14752 {
14753 value = GEN_INT (-INTVAL (value));
14754 code = PLUS;
14755 }
14756 /* Fall through. */
14757
14758 default:
14759 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14760 emit_insn (gen_rtx_SET (new_out, x));
14761 break;
14762 }
14763
14764 aarch64_emit_store_exclusive (mode, cond, mem,
14765 gen_lowpart (mode, new_out), model_rtx);
14766
14767 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14768 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14769 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14770 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14771
14772 /* Emit any final barrier needed for a __sync operation. */
14773 if (is_sync)
14774 aarch64_emit_post_barrier (model);
14775 }
14776
14777 static void
14778 aarch64_init_libfuncs (void)
14779 {
14780 /* Half-precision float operations. The compiler handles all operations
14781 with NULL libfuncs by converting to SFmode. */
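/* For example, an HFmode addition is expanded by widening both operands
   to SFmode, adding in SFmode and truncating the result back to HFmode,
   rather than calling an HFmode soft-float library routine.  */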
14782
14783 /* Conversions. */
14784 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14785 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14786
14787 /* Arithmetic. */
14788 set_optab_libfunc (add_optab, HFmode, NULL);
14789 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14790 set_optab_libfunc (smul_optab, HFmode, NULL);
14791 set_optab_libfunc (neg_optab, HFmode, NULL);
14792 set_optab_libfunc (sub_optab, HFmode, NULL);
14793
14794 /* Comparisons. */
14795 set_optab_libfunc (eq_optab, HFmode, NULL);
14796 set_optab_libfunc (ne_optab, HFmode, NULL);
14797 set_optab_libfunc (lt_optab, HFmode, NULL);
14798 set_optab_libfunc (le_optab, HFmode, NULL);
14799 set_optab_libfunc (ge_optab, HFmode, NULL);
14800 set_optab_libfunc (gt_optab, HFmode, NULL);
14801 set_optab_libfunc (unord_optab, HFmode, NULL);
14802 }
14803
14804 /* Target hook for c_mode_for_suffix. */
14805 static machine_mode
14806 aarch64_c_mode_for_suffix (char suffix)
14807 {
14808 if (suffix == 'q')
14809 return TFmode;
14810
14811 return VOIDmode;
14812 }
14813
14814 /* We can only represent floating point constants which will fit in
14815 "quarter-precision" values. These values are characterised by
14816 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
14817 by:
14818
14819 (-1)^s * (n/16) * 2^r
14820
14821 Where:
14822 's' is the sign bit.
14823 'n' is an integer in the range 16 <= n <= 31.
14824 'r' is an integer in the range -3 <= r <= 4. */
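/* For example, 2.0 is (16/16) * 2^1 and 0.125 is (16/16) * 2^-3; the
   largest representable magnitude is (31/16) * 2^4 == 31.0.  A value
   such as 0.1 has no exact encoding of this form.  */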
14825
14826 /* Return true iff X can be represented as a quarter-precision
14827 floating point immediate operand. Note, we cannot represent 0.0. */
14828 bool
14829 aarch64_float_const_representable_p (rtx x)
14830 {
14831 /* This represents our current view of how many bits
14832 make up the mantissa. */
14833 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14834 int exponent;
14835 unsigned HOST_WIDE_INT mantissa, mask;
14836 REAL_VALUE_TYPE r, m;
14837 bool fail;
14838
14839 if (!CONST_DOUBLE_P (x))
14840 return false;
14841
14842 /* We don't support HFmode constants yet. */
14843 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14844 return false;
14845
14846 r = *CONST_DOUBLE_REAL_VALUE (x);
14847
14848 /* We cannot represent infinities, NaNs or +/-zero. We won't
14849 know if we have +zero until we analyse the mantissa, but we
14850 can reject the other invalid values. */
14851 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14852 || REAL_VALUE_MINUS_ZERO (r))
14853 return false;
14854
14855 /* Extract exponent. */
14856 r = real_value_abs (&r);
14857 exponent = REAL_EXP (&r);
14858
14859 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14860 highest (sign) bit, with a fixed binary point at bit point_pos.
14861 m1 holds the low part of the mantissa, m2 the high part.
14862 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14863 bits for the mantissa, this can fail (low bits will be lost). */
14864 real_ldexp (&m, &r, point_pos - exponent);
14865 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14866
14867 /* If the low part of the mantissa has bits set we cannot represent
14868 the value. */
14869 if (w.ulow () != 0)
14870 return false;
14871 /* We have rejected the lower HOST_WIDE_INT, so update our
14872 understanding of how many bits lie in the mantissa and
14873 look only at the high HOST_WIDE_INT. */
14874 mantissa = w.elt (1);
14875 point_pos -= HOST_BITS_PER_WIDE_INT;
14876
14877 /* We can only represent values with a mantissa of the form 1.xxxx. */
14878 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14879 if ((mantissa & mask) != 0)
14880 return false;
14881
14882 /* Having filtered unrepresentable values, we may now remove all
14883 but the highest 5 bits. */
14884 mantissa >>= point_pos - 5;
14885
14886 /* We cannot represent the value 0.0, so reject it. This is handled
14887 elsewhere. */
14888 if (mantissa == 0)
14889 return false;
14890
14891 /* Then, as bit 4 is always set, we can mask it off, leaving
14892 the mantissa in the range [0, 15]. */
14893 mantissa &= ~(1 << 4);
14894 gcc_assert (mantissa <= 15);
14895
14896 /* GCC internally does not use IEEE754-like encoding (where normalized
14897 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14898 Our mantissa values are shifted 4 places to the left relative to
14899 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14900 by 5 places to correct for GCC's representation. */
14901 exponent = 5 - exponent;
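/* For example, 1.0 is stored internally as 0.5 * 2^1, so REAL_EXP returns 1
   and the adjusted exponent is 4, which passes the range check below.  */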
14902
14903 return (exponent >= 0 && exponent <= 7);
14904 }
14905
14906 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14907 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14908 output MOVI/MVNI, ORR or BIC immediate. */
14909 char*
14910 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14911 enum simd_immediate_check which)
14912 {
14913 bool is_valid;
14914 static char templ[40];
14915 const char *mnemonic;
14916 const char *shift_op;
14917 unsigned int lane_count = 0;
14918 char element_char;
14919
14920 struct simd_immediate_info info;
14921
14922 /* This will return true to show const_vector is legal for use as either
14923 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14924 It will also update INFO to show how the immediate should be generated.
14925 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14926 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14927 gcc_assert (is_valid);
14928
14929 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14930 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14931
14932 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14933 {
14934 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14935 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14936 move immediate path. */
14937 if (aarch64_float_const_zero_rtx_p (info.value))
14938 info.value = GEN_INT (0);
14939 else
14940 {
14941 const unsigned int buf_size = 20;
14942 char float_buf[buf_size] = {'\0'};
14943 real_to_decimal_for_mode (float_buf,
14944 CONST_DOUBLE_REAL_VALUE (info.value),
14945 buf_size, buf_size, 1, info.elt_mode);
14946
14947 if (lane_count == 1)
14948 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14949 else
14950 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14951 lane_count, element_char, float_buf);
14952 return templ;
14953 }
14954 }
14955
14956 gcc_assert (CONST_INT_P (info.value));
14957
14958 if (which == AARCH64_CHECK_MOV)
14959 {
14960 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14961 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14962 if (lane_count == 1)
14963 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14964 mnemonic, UINTVAL (info.value));
14965 else if (info.shift)
14966 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14967 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14968 element_char, UINTVAL (info.value), shift_op, info.shift);
14969 else
14970 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14971 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14972 element_char, UINTVAL (info.value));
14973 }
14974 else
14975 {
14976 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14977 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14978 if (info.shift)
14979 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14980 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14981 element_char, UINTVAL (info.value), "lsl", info.shift);
14982 else
14983 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14984 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14985 element_char, UINTVAL (info.value));
14986 }
14987 return templ;
14988 }
14989
14990 char*
14991 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14992 {
14993
14994 /* If a floating point number was passed and we desire to use it in an
14995 integer mode do the conversion to integer. */
14996 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14997 {
14998 unsigned HOST_WIDE_INT ival;
14999 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15000 gcc_unreachable ();
15001 immediate = gen_int_mode (ival, mode);
15002 }
15003
15004 machine_mode vmode;
15005 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
15006 a 128-bit vector mode. */
15007 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15008
15009 vmode = aarch64_simd_container_mode (mode, width);
15010 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15011 return aarch64_output_simd_mov_immediate (v_op, width);
15012 }
15013
15014 /* Return the output string to use for moving immediate CONST_VECTOR
15015 into an SVE register. */
15016
15017 char *
15018 aarch64_output_sve_mov_immediate (rtx const_vector)
15019 {
15020 static char templ[40];
15021 struct simd_immediate_info info;
15022 char element_char;
15023
15024 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15025 gcc_assert (is_valid);
15026
15027 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15028
15029 if (info.step)
15030 {
15031 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15032 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15033 element_char, INTVAL (info.value), INTVAL (info.step));
15034 return templ;
15035 }
15036
15037 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15038 {
15039 if (aarch64_float_const_zero_rtx_p (info.value))
15040 info.value = GEN_INT (0);
15041 else
15042 {
15043 const int buf_size = 20;
15044 char float_buf[buf_size] = {};
15045 real_to_decimal_for_mode (float_buf,
15046 CONST_DOUBLE_REAL_VALUE (info.value),
15047 buf_size, buf_size, 1, info.elt_mode);
15048
15049 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15050 element_char, float_buf);
15051 return templ;
15052 }
15053 }
15054
15055 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15056 element_char, INTVAL (info.value));
15057 return templ;
15058 }
15059
15060 /* Return the asm format for a PTRUE instruction whose destination has
15061 mode MODE. SUFFIX is the element size suffix. */
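/* For example, with -msve-vector-bits=256 a VNx16BImode destination has a
   constant 32 units, giving "ptrue\t%0.b, vl32"; for variable-length SVE
   the result is "ptrue\t%0.b, all".  */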
15062
15063 char *
15064 aarch64_output_ptrue (machine_mode mode, char suffix)
15065 {
15066 unsigned int nunits;
15067 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15068 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15069 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15070 else
15071 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15072 return buf;
15073 }
15074
15075 /* Split operands into moves from op[1] + op[2] into op[0]. */
15076
15077 void
15078 aarch64_split_combinev16qi (rtx operands[3])
15079 {
15080 unsigned int dest = REGNO (operands[0]);
15081 unsigned int src1 = REGNO (operands[1]);
15082 unsigned int src2 = REGNO (operands[2]);
15083 machine_mode halfmode = GET_MODE (operands[1]);
15084 unsigned int halfregs = REG_NREGS (operands[1]);
15085 rtx destlo, desthi;
15086
15087 gcc_assert (halfmode == V16QImode);
15088
15089 if (src1 == dest && src2 == dest + halfregs)
15090 {
15091 /* No-op move. Can't split to nothing; emit something. */
15092 emit_note (NOTE_INSN_DELETED);
15093 return;
15094 }
15095
15096 /* Preserve register attributes for variable tracking. */
15097 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15098 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15099 GET_MODE_SIZE (halfmode));
15100
15101 /* Special case of reversed high/low parts. */
15102 if (reg_overlap_mentioned_p (operands[2], destlo)
15103 && reg_overlap_mentioned_p (operands[1], desthi))
15104 {
15105 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15106 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15107 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15108 }
15109 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15110 {
15111 /* Try to avoid unnecessary moves if part of the result
15112 is in the right place already. */
15113 if (src1 != dest)
15114 emit_move_insn (destlo, operands[1]);
15115 if (src2 != dest + halfregs)
15116 emit_move_insn (desthi, operands[2]);
15117 }
15118 else
15119 {
15120 if (src2 != dest + halfregs)
15121 emit_move_insn (desthi, operands[2]);
15122 if (src1 != dest)
15123 emit_move_insn (destlo, operands[1]);
15124 }
15125 }
15126
15127 /* vec_perm support. */
15128
15129 struct expand_vec_perm_d
15130 {
15131 rtx target, op0, op1;
15132 vec_perm_indices perm;
15133 machine_mode vmode;
15134 unsigned int vec_flags;
15135 bool one_vector_p;
15136 bool testing_p;
15137 };
15138
15139 /* Generate a variable permutation. */
15140
15141 static void
15142 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15143 {
15144 machine_mode vmode = GET_MODE (target);
15145 bool one_vector_p = rtx_equal_p (op0, op1);
15146
15147 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15148 gcc_checking_assert (GET_MODE (op0) == vmode);
15149 gcc_checking_assert (GET_MODE (op1) == vmode);
15150 gcc_checking_assert (GET_MODE (sel) == vmode);
15151 gcc_checking_assert (TARGET_SIMD);
15152
15153 if (one_vector_p)
15154 {
15155 if (vmode == V8QImode)
15156 {
15157 /* Expand the argument to a V16QI mode by duplicating it. */
15158 rtx pair = gen_reg_rtx (V16QImode);
15159 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15160 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15161 }
15162 else
15163 {
15164 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15165 }
15166 }
15167 else
15168 {
15169 rtx pair;
15170
15171 if (vmode == V8QImode)
15172 {
15173 pair = gen_reg_rtx (V16QImode);
15174 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15175 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15176 }
15177 else
15178 {
15179 pair = gen_reg_rtx (OImode);
15180 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15181 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15182 }
15183 }
15184 }
15185
15186 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15187 NELT is the number of elements in the vector. */
15188
15189 void
15190 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15191 unsigned int nelt)
15192 {
15193 machine_mode vmode = GET_MODE (target);
15194 bool one_vector_p = rtx_equal_p (op0, op1);
15195 rtx mask;
15196
15197 /* The TBL instruction does not use a modulo index, so we must take care
15198 of that ourselves. */
15199 mask = aarch64_simd_gen_const_vector_dup (vmode,
15200 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15201 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
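/* For example, with a single V8QImode input the mask is 7, so an
   out-of-range selector value of 9 becomes 9 & 7 == 1, giving the
   wrap-around behaviour that vec_perm requires.  */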
15202
15203 /* For big-endian, we also need to reverse the index within the vector
15204 (but not which vector). */
15205 if (BYTES_BIG_ENDIAN)
15206 {
15207 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15208 if (!one_vector_p)
15209 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15210 sel = expand_simple_binop (vmode, XOR, sel, mask,
15211 NULL, 0, OPTAB_LIB_WIDEN);
15212 }
15213 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15214 }
15215
15216 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15217
15218 static void
15219 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15220 {
15221 emit_insn (gen_rtx_SET (target,
15222 gen_rtx_UNSPEC (GET_MODE (target),
15223 gen_rtvec (2, op0, op1), code)));
15224 }
15225
15226 /* Expand an SVE vec_perm with the given operands. */
15227
15228 void
15229 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15230 {
15231 machine_mode data_mode = GET_MODE (target);
15232 machine_mode sel_mode = GET_MODE (sel);
15233 /* Enforced by the pattern condition. */
15234 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15235
15236 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15237 size of the two value vectors, i.e. the upper bits of the indices
15238 are effectively ignored. SVE TBL instead produces 0 for any
15239 out-of-range indices, so we need to modulo all the vec_perm indices
15240 to ensure they are all in range. */
15241 rtx sel_reg = force_reg (sel_mode, sel);
15242
15243 /* Check if the sel only references the first values vector. */
15244 if (GET_CODE (sel) == CONST_VECTOR
15245 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15246 {
15247 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15248 return;
15249 }
15250
15251 /* Check if the two values vectors are the same. */
15252 if (rtx_equal_p (op0, op1))
15253 {
15254 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15255 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15256 NULL, 0, OPTAB_DIRECT);
15257 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15258 return;
15259 }
15260
15261 /* Run TBL on each value vector and combine the results. */
15262
15263 rtx res0 = gen_reg_rtx (data_mode);
15264 rtx res1 = gen_reg_rtx (data_mode);
15265 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15266 if (GET_CODE (sel) != CONST_VECTOR
15267 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15268 {
15269 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15270 2 * nunits - 1);
15271 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15272 NULL, 0, OPTAB_DIRECT);
15273 }
15274 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15275 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15276 NULL, 0, OPTAB_DIRECT);
15277 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15278 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15279 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15280 else
15281 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15282 }
15283
15284 /* Recognize patterns suitable for the TRN instructions. */
15285 static bool
15286 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15287 {
15288 HOST_WIDE_INT odd;
15289 poly_uint64 nelt = d->perm.length ();
15290 rtx out, in0, in1, x;
15291 machine_mode vmode = d->vmode;
15292
15293 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15294 return false;
15295
15296 /* Note that these are little-endian tests.
15297 We correct for big-endian later. */
15298 if (!d->perm[0].is_constant (&odd)
15299 || (odd != 0 && odd != 1)
15300 || !d->perm.series_p (0, 2, odd, 2)
15301 || !d->perm.series_p (1, 2, nelt + odd, 2))
15302 return false;
15303
15304 /* Success! */
15305 if (d->testing_p)
15306 return true;
15307
15308 in0 = d->op0;
15309 in1 = d->op1;
15310 /* We don't need a big-endian lane correction for SVE; see the comment
15311 at the head of aarch64-sve.md for details. */
15312 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15313 {
15314 x = in0, in0 = in1, in1 = x;
15315 odd = !odd;
15316 }
15317 out = d->target;
15318
15319 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15320 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15321 return true;
15322 }
15323
15324 /* Recognize patterns suitable for the UZP instructions. */
15325 static bool
15326 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15327 {
15328 HOST_WIDE_INT odd;
15329 rtx out, in0, in1, x;
15330 machine_mode vmode = d->vmode;
15331
15332 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15333 return false;
15334
15335 /* Note that these are little-endian tests.
15336 We correct for big-endian later. */
15337 if (!d->perm[0].is_constant (&odd)
15338 || (odd != 0 && odd != 1)
15339 || !d->perm.series_p (0, 1, odd, 2))
15340 return false;
15341
15342 /* Success! */
15343 if (d->testing_p)
15344 return true;
15345
15346 in0 = d->op0;
15347 in1 = d->op1;
15348 /* We don't need a big-endian lane correction for SVE; see the comment
15349 at the head of aarch64-sve.md for details. */
15350 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15351 {
15352 x = in0, in0 = in1, in1 = x;
15353 odd = !odd;
15354 }
15355 out = d->target;
15356
15357 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15358 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15359 return true;
15360 }
15361
15362 /* Recognize patterns suitable for the ZIP instructions. */
15363 static bool
15364 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15365 {
15366 unsigned int high;
15367 poly_uint64 nelt = d->perm.length ();
15368 rtx out, in0, in1, x;
15369 machine_mode vmode = d->vmode;
15370
15371 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15372 return false;
15373
15374 /* Note that these are little-endian tests.
15375 We correct for big-endian later. */
15376 poly_uint64 first = d->perm[0];
15377 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15378 || !d->perm.series_p (0, 2, first, 1)
15379 || !d->perm.series_p (1, 2, first + nelt, 1))
15380 return false;
15381 high = maybe_ne (first, 0U);
15382
15383 /* Success! */
15384 if (d->testing_p)
15385 return true;
15386
15387 in0 = d->op0;
15388 in1 = d->op1;
15389 /* We don't need a big-endian lane correction for SVE; see the comment
15390 at the head of aarch64-sve.md for details. */
15391 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15392 {
15393 x = in0, in0 = in1, in1 = x;
15394 high = !high;
15395 }
15396 out = d->target;
15397
15398 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15399 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15400 return true;
15401 }
15402
15403 /* Recognize patterns for the EXT insn. */
15404
15405 static bool
15406 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15407 {
15408 HOST_WIDE_INT location;
15409 rtx offset;
15410
15411 /* The first element always refers to the first vector.
15412 Check if the extracted indices are increasing by one. */
15413 if (d->vec_flags == VEC_SVE_PRED
15414 || !d->perm[0].is_constant (&location)
15415 || !d->perm.series_p (0, 1, location, 1))
15416 return false;
15417
15418 /* Success! */
15419 if (d->testing_p)
15420 return true;
15421
15422 /* The case where (location == 0) is a no-op for both big- and little-endian,
15423 and is removed by the mid-end at optimization levels -O1 and higher.
15424
15425 We don't need a big-endian lane correction for SVE; see the comment
15426 at the head of aarch64-sve.md for details. */
15427 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15428 {
15429 /* After setup, we want the high elements of the first vector (stored
15430 at the LSB end of the register), and the low elements of the second
15431 vector (stored at the MSB end of the register). So swap. */
15432 std::swap (d->op0, d->op1);
15433 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15434 to_constant () is safe since this is restricted to Advanced SIMD
15435 vectors. */
15436 location = d->perm.length ().to_constant () - location;
15437 }
15438
15439 offset = GEN_INT (location);
15440 emit_set_insn (d->target,
15441 gen_rtx_UNSPEC (d->vmode,
15442 gen_rtvec (3, d->op0, d->op1, offset),
15443 UNSPEC_EXT));
15444 return true;
15445 }
15446
15447 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15448 within each 64-bit, 32-bit or 16-bit granule. */
15449
15450 static bool
15451 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15452 {
15453 HOST_WIDE_INT diff;
15454 unsigned int i, size, unspec;
15455 machine_mode pred_mode;
15456
15457 if (d->vec_flags == VEC_SVE_PRED
15458 || !d->one_vector_p
15459 || !d->perm[0].is_constant (&diff))
15460 return false;
15461
15462 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15463 if (size == 8)
15464 {
15465 unspec = UNSPEC_REV64;
15466 pred_mode = VNx2BImode;
15467 }
15468 else if (size == 4)
15469 {
15470 unspec = UNSPEC_REV32;
15471 pred_mode = VNx4BImode;
15472 }
15473 else if (size == 2)
15474 {
15475 unspec = UNSPEC_REV16;
15476 pred_mode = VNx8BImode;
15477 }
15478 else
15479 return false;
15480
15481 unsigned int step = diff + 1;
15482 for (i = 0; i < step; ++i)
15483 if (!d->perm.series_p (i, step, diff - i, step))
15484 return false;
15485
15486 /* Success! */
15487 if (d->testing_p)
15488 return true;
15489
15490 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15491 if (d->vec_flags == VEC_SVE_DATA)
15492 {
15493 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15494 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15495 UNSPEC_MERGE_PTRUE);
15496 }
15497 emit_set_insn (d->target, src);
15498 return true;
15499 }
15500
15501 /* Recognize patterns for the REV insn, which reverses elements within
15502 a full vector. */
15503
15504 static bool
15505 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15506 {
15507 poly_uint64 nelt = d->perm.length ();
15508
15509 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15510 return false;
15511
15512 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15513 return false;
15514
15515 /* Success! */
15516 if (d->testing_p)
15517 return true;
15518
15519 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15520 emit_set_insn (d->target, src);
15521 return true;
15522 }
15523
15524 static bool
15525 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15526 {
15527 rtx out = d->target;
15528 rtx in0;
15529 HOST_WIDE_INT elt;
15530 machine_mode vmode = d->vmode;
15531 rtx lane;
15532
15533 if (d->vec_flags == VEC_SVE_PRED
15534 || d->perm.encoding ().encoded_nelts () != 1
15535 || !d->perm[0].is_constant (&elt))
15536 return false;
15537
15538 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 / GET_MODE_UNIT_SIZE (vmode))
15539 return false;
15540
15541 /* Success! */
15542 if (d->testing_p)
15543 return true;
15544
15545 /* The generic preparation in aarch64_expand_vec_perm_const_1
15546 swaps the operand order and the permute indices if it finds
15547 d->perm[0] to be in the second operand. Thus, we can always
15548 use d->op0 and need not do any extra arithmetic to get the
15549 correct lane number. */
15550 in0 = d->op0;
15551 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15552
15553 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15554 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15555 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15556 return true;
15557 }
15558
15559 static bool
15560 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15561 {
15562 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15563 machine_mode vmode = d->vmode;
15564
15565 /* Make sure that the indices are constant. */
15566 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15567 for (unsigned int i = 0; i < encoded_nelts; ++i)
15568 if (!d->perm[i].is_constant ())
15569 return false;
15570
15571 if (d->testing_p)
15572 return true;
15573
15574 /* Generic code will try constant permutation twice. Once with the
15575 original mode and again with the elements lowered to QImode.
15576 So wait and don't do the selector expansion ourselves. */
15577 if (vmode != V8QImode && vmode != V16QImode)
15578 return false;
15579
15580 /* to_constant is safe since this routine is specific to Advanced SIMD
15581 vectors. */
15582 unsigned int nelt = d->perm.length ().to_constant ();
15583 for (unsigned int i = 0; i < nelt; ++i)
15584 /* If big-endian and two vectors we end up with a weird mixed-endian
15585 mode on NEON. Reverse the index within each word but not the word
15586 itself. to_constant is safe because we checked is_constant above. */
15587 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15588 ? d->perm[i].to_constant () ^ (nelt - 1)
15589 : d->perm[i].to_constant ());
15590
15591 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15592 sel = force_reg (vmode, sel);
15593
15594 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15595 return true;
15596 }
15597
15598 /* Try to implement D using an SVE TBL instruction. */
15599
15600 static bool
15601 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15602 {
15603 unsigned HOST_WIDE_INT nelt;
15604
15605 /* Permuting two variable-length vectors could overflow the
15606 index range. */
15607 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15608 return false;
15609
15610 if (d->testing_p)
15611 return true;
15612
15613 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15614 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15615 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15616 return true;
15617 }
15618
15619 static bool
15620 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15621 {
15622 /* The pattern matching functions above are written to look for a small
15623 number to begin the sequence (0, 1, N/2). If we begin with an index
15624 from the second operand, we can swap the operands. */
15625 poly_int64 nelt = d->perm.length ();
15626 if (known_ge (d->perm[0], nelt))
15627 {
15628 d->perm.rotate_inputs (1);
15629 std::swap (d->op0, d->op1);
15630 }
15631
15632 if ((d->vec_flags == VEC_ADVSIMD
15633 || d->vec_flags == VEC_SVE_DATA
15634 || d->vec_flags == VEC_SVE_PRED)
15635 && known_gt (nelt, 1))
15636 {
15637 if (aarch64_evpc_rev_local (d))
15638 return true;
15639 else if (aarch64_evpc_rev_global (d))
15640 return true;
15641 else if (aarch64_evpc_ext (d))
15642 return true;
15643 else if (aarch64_evpc_dup (d))
15644 return true;
15645 else if (aarch64_evpc_zip (d))
15646 return true;
15647 else if (aarch64_evpc_uzp (d))
15648 return true;
15649 else if (aarch64_evpc_trn (d))
15650 return true;
15651 if (d->vec_flags == VEC_SVE_DATA)
15652 return aarch64_evpc_sve_tbl (d);
15653 else if (d->vec_flags == VEC_ADVSIMD)
15654 return aarch64_evpc_tbl (d);
15655 }
15656 return false;
15657 }
15658
15659 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15660
15661 static bool
15662 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15663 rtx op1, const vec_perm_indices &sel)
15664 {
15665 struct expand_vec_perm_d d;
15666
15667 /* Check whether the mask can be applied to a single vector. */
15668 if (op0 && rtx_equal_p (op0, op1))
15669 d.one_vector_p = true;
15670 else if (sel.all_from_input_p (0))
15671 {
15672 d.one_vector_p = true;
15673 op1 = op0;
15674 }
15675 else if (sel.all_from_input_p (1))
15676 {
15677 d.one_vector_p = true;
15678 op0 = op1;
15679 }
15680 else
15681 d.one_vector_p = false;
15682
15683 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15684 sel.nelts_per_input ());
15685 d.vmode = vmode;
15686 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15687 d.target = target;
15688 d.op0 = op0;
15689 d.op1 = op1;
15690 d.testing_p = !target;
15691
15692 if (!d.testing_p)
15693 return aarch64_expand_vec_perm_const_1 (&d);
15694
15695 rtx_insn *last = get_last_insn ();
15696 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15697 gcc_assert (last == get_last_insn ());
15698
15699 return ret;
15700 }
15701
15702 /* Generate a byte permute mask for a register of mode MODE,
15703 which has NUNITS units. */
15704
15705 rtx
15706 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15707 {
15708 /* We have to reverse each vector because we don't have
15709 a permuted load that can reverse-load according to ABI rules. */
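/* For example, for V8HImode (usize == 2) the selector built below is
   { 1, 0, 3, 2, 5, 4, ... }, i.e. the bytes are swapped within each
   16-bit element.  */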
15710 rtx mask;
15711 rtvec v = rtvec_alloc (16);
15712 unsigned int i, j;
15713 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15714
15715 gcc_assert (BYTES_BIG_ENDIAN);
15716 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15717
15718 for (i = 0; i < nunits; i++)
15719 for (j = 0; j < usize; j++)
15720 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15721 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15722 return force_reg (V16QImode, mask);
15723 }
15724
15725 /* Return true if X is a valid second operand for the SVE instruction
15726 that implements integer comparison OP_CODE. */
15727
15728 static bool
15729 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15730 {
15731 if (register_operand (x, VOIDmode))
15732 return true;
15733
15734 switch (op_code)
15735 {
15736 case LTU:
15737 case LEU:
15738 case GEU:
15739 case GTU:
15740 return aarch64_sve_cmp_immediate_p (x, false);
15741 case LT:
15742 case LE:
15743 case GE:
15744 case GT:
15745 case NE:
15746 case EQ:
15747 return aarch64_sve_cmp_immediate_p (x, true);
15748 default:
15749 gcc_unreachable ();
15750 }
15751 }
15752
15753 /* Use predicated SVE instructions to implement the equivalent of:
15754
15755 (set TARGET OP)
15756
15757 given that PTRUE is an all-true predicate of the appropriate mode. */
15758
15759 static void
15760 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15761 {
15762 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15763 gen_rtvec (2, ptrue, op),
15764 UNSPEC_MERGE_PTRUE);
15765 rtx_insn *insn = emit_set_insn (target, unspec);
15766 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15767 }
15768
15769 /* Likewise, but also clobber the condition codes. */
15770
15771 static void
15772 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15773 {
15774 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15775 gen_rtvec (2, ptrue, op),
15776 UNSPEC_MERGE_PTRUE);
15777 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15778 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15779 }
15780
15781 /* Return the UNSPEC_COND_* code for comparison CODE. */
15782
15783 static unsigned int
15784 aarch64_unspec_cond_code (rtx_code code)
15785 {
15786 switch (code)
15787 {
15788 case NE:
15789 return UNSPEC_COND_NE;
15790 case EQ:
15791 return UNSPEC_COND_EQ;
15792 case LT:
15793 return UNSPEC_COND_LT;
15794 case GT:
15795 return UNSPEC_COND_GT;
15796 case LE:
15797 return UNSPEC_COND_LE;
15798 case GE:
15799 return UNSPEC_COND_GE;
15800 default:
15801 gcc_unreachable ();
15802 }
15803 }
15804
15805 /* Emit:
15806
15807 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15808
15809 where <X> is the operation associated with comparison CODE. This form
15810 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15811 semantics, such as when PRED might not be all-true and when comparing
15812 inactive lanes could have side effects. */
15813
15814 static void
15815 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15816 rtx pred, rtx op0, rtx op1)
15817 {
15818 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15819 gen_rtvec (3, pred, op0, op1),
15820 aarch64_unspec_cond_code (code));
15821 emit_set_insn (target, unspec);
15822 }
15823
15824 /* Expand an SVE integer comparison using the SVE equivalent of:
15825
15826 (set TARGET (CODE OP0 OP1)). */
15827
15828 void
15829 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15830 {
15831 machine_mode pred_mode = GET_MODE (target);
15832 machine_mode data_mode = GET_MODE (op0);
15833
15834 if (!aarch64_sve_cmp_operand_p (code, op1))
15835 op1 = force_reg (data_mode, op1);
15836
15837 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15838 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15839 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15840 }
15841
15842 /* Emit the SVE equivalent of:
15843
15844 (set TMP1 (CODE1 OP0 OP1))
15845 (set TMP2 (CODE2 OP0 OP1))
15846 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15847
15848 PTRUE is an all-true predicate with the same mode as TARGET. */
15849
15850 static void
15851 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15852 rtx ptrue, rtx op0, rtx op1)
15853 {
15854 machine_mode pred_mode = GET_MODE (ptrue);
15855 rtx tmp1 = gen_reg_rtx (pred_mode);
15856 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15857 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15858 rtx tmp2 = gen_reg_rtx (pred_mode);
15859 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15860 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15861 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15862 }
15863
15864 /* Emit the SVE equivalent of:
15865
15866 (set TMP (CODE OP0 OP1))
15867 (set TARGET (not TMP))
15868
15869 PTRUE is an all-true predicate with the same mode as TARGET. */
15870
15871 static void
15872 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15873 rtx op0, rtx op1)
15874 {
15875 machine_mode pred_mode = GET_MODE (ptrue);
15876 rtx tmp = gen_reg_rtx (pred_mode);
15877 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15878 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15879 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15880 }
15881
15882 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15883
15884 (set TARGET (CODE OP0 OP1))
15885
15886 If CAN_INVERT_P is true, the caller can also handle inverted results;
15887 return true if the result is in fact inverted. */
15888
15889 bool
15890 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15891 rtx op0, rtx op1, bool can_invert_p)
15892 {
15893 machine_mode pred_mode = GET_MODE (target);
15894 machine_mode data_mode = GET_MODE (op0);
15895
15896 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15897 switch (code)
15898 {
15899 case UNORDERED:
15900 /* UNORDERED has no immediate form. */
15901 op1 = force_reg (data_mode, op1);
15902 /* fall through */
15903 case LT:
15904 case LE:
15905 case GT:
15906 case GE:
15907 case EQ:
15908 case NE:
15909 {
15910 /* There is native support for the comparison. */
15911 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15912 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15913 return false;
15914 }
15915
15916 case LTGT:
15917 /* This is a trapping operation (LT or GT). */
15918 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15919 return false;
15920
15921 case UNEQ:
15922 if (!flag_trapping_math)
15923 {
15924 /* This would trap for signaling NaNs. */
15925 op1 = force_reg (data_mode, op1);
15926 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15927 return false;
15928 }
15929 /* fall through */
15930 case UNLT:
15931 case UNLE:
15932 case UNGT:
15933 case UNGE:
15934 if (flag_trapping_math)
15935 {
15936 /* Work out which elements are ordered. */
15937 rtx ordered = gen_reg_rtx (pred_mode);
15938 op1 = force_reg (data_mode, op1);
15939 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15940
15941 /* Test the opposite condition for the ordered elements,
15942 then invert the result. */
15943 if (code == UNEQ)
15944 code = NE;
15945 else
15946 code = reverse_condition_maybe_unordered (code);
15947 if (can_invert_p)
15948 {
15949 aarch64_emit_sve_predicated_cond (target, code,
15950 ordered, op0, op1);
15951 return true;
15952 }
15953 rtx tmp = gen_reg_rtx (pred_mode);
15954 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15955 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15956 return false;
15957 }
15958 break;
15959
15960 case ORDERED:
15961 /* ORDERED has no immediate form. */
15962 op1 = force_reg (data_mode, op1);
15963 break;
15964
15965 default:
15966 gcc_unreachable ();
15967 }
15968
15969 /* There is native support for the inverse comparison. */
15970 code = reverse_condition_maybe_unordered (code);
15971 if (can_invert_p)
15972 {
15973 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15974 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15975 return true;
15976 }
15977 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
15978 return false;
15979 }
15980
15981 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15982 of the data being selected and CMP_MODE is the mode of the values being
15983 compared. */
15984
15985 void
15986 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15987 rtx *ops)
15988 {
15989 machine_mode pred_mode
15990 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15991 GET_MODE_SIZE (cmp_mode)).require ();
15992 rtx pred = gen_reg_rtx (pred_mode);
15993 if (FLOAT_MODE_P (cmp_mode))
15994 {
15995 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15996 ops[4], ops[5], true))
15997 std::swap (ops[1], ops[2]);
15998 }
15999 else
16000 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16001
16002 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16003 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16004 }
16005
16006 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16007 true. However, due to issues with register allocation it is preferable
16008 to avoid tying integer scalar and FP scalar modes. Executing integer
16009 operations in general registers is better than treating them as scalar
16010 vector operations. This reduces latency and avoids redundant int<->FP
16011 moves. So tie modes if they are either the same class, or vector modes
16012 with other vector modes, vector structs or any scalar mode. */
16013
16014 static bool
16015 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16016 {
16017 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16018 return true;
16019
16020 /* We specifically want to allow elements of "structure" modes to
16021 be tieable to the structure. This more general condition allows
16022 other rarer situations too. The reason we don't extend this to
16023 predicate modes is that there are no predicate structure modes
16024 nor any specific instructions for extracting part of a predicate
16025 register. */
16026 if (aarch64_vector_data_mode_p (mode1)
16027 && aarch64_vector_data_mode_p (mode2))
16028 return true;
16029
16030 /* Also allow any scalar modes with vectors. */
16031 if (aarch64_vector_mode_supported_p (mode1)
16032 || aarch64_vector_mode_supported_p (mode2))
16033 return true;
16034
16035 return false;
16036 }
16037
16038 /* Return a new RTX holding the result of moving POINTER forward by
16039 AMOUNT bytes. */
16040
16041 static rtx
16042 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16043 {
16044 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16045
16046 return adjust_automodify_address (pointer, GET_MODE (pointer),
16047 next, amount);
16048 }
16049
16050 /* Return a new RTX holding the result of moving POINTER forward by the
16051 size of the mode it points to. */
16052
16053 static rtx
16054 aarch64_progress_pointer (rtx pointer)
16055 {
16056 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16057 }
16058
16059 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16060 MODE bytes. */
16061
16062 static void
16063 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16064 machine_mode mode)
16065 {
16066 rtx reg = gen_reg_rtx (mode);
16067
16068 /* "Cast" the pointers to the correct mode. */
16069 *src = adjust_address (*src, mode, 0);
16070 *dst = adjust_address (*dst, mode, 0);
16071 /* Emit the memcpy. */
16072 emit_move_insn (reg, *src);
16073 emit_move_insn (*dst, reg);
16074 /* Move the pointers forward. */
16075 *src = aarch64_progress_pointer (*src);
16076 *dst = aarch64_progress_pointer (*dst);
16077 }
16078
16079 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16080 we succeed, otherwise return false. */
16081
16082 bool
16083 aarch64_expand_movmem (rtx *operands)
16084 {
16085 unsigned int n;
16086 rtx dst = operands[0];
16087 rtx src = operands[1];
16088 rtx base;
16089 bool speed_p = !optimize_function_for_size_p (cfun);
16090
16091 /* When optimizing for size, give a better estimate of the length of a
16092 memcpy call, but use the default otherwise. */
16093 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16094
16095 /* We can't do anything smart if the amount to copy is not constant. */
16096 if (!CONST_INT_P (operands[2]))
16097 return false;
16098
16099 n = UINTVAL (operands[2]);
16100
16101 /* Try to keep the number of instructions low. For cases below 16 bytes we
16102 need to make at most two moves. For cases above 16 bytes it will be one
16103 move for each 16 byte chunk, then at most two additional moves. */
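/* For example, n == 35 is estimated as 35/16 + 2 == 4 moves, comfortably
   below the limit of 15/2 == 7 used when optimizing for speed.  */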
16104 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16105 return false;
16106
16107 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16108 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16109
16110 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16111 src = adjust_automodify_address (src, VOIDmode, base, 0);
16112
16113 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16114 1-byte chunk. */
16115 if (n < 4)
16116 {
16117 if (n >= 2)
16118 {
16119 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16120 n -= 2;
16121 }
16122
16123 if (n == 1)
16124 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16125
16126 return true;
16127 }
16128
16129 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16130 4-byte chunk, partially overlapping with the previously copied chunk. */
16131 if (n < 8)
16132 {
16133 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16134 n -= 4;
16135 if (n > 0)
16136 {
16137 int move = n - 4;
16138
16139 src = aarch64_move_pointer (src, move);
16140 dst = aarch64_move_pointer (dst, move);
16141 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16142 }
16143 return true;
16144 }
16145
16146 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16147 them, then (if applicable) an 8-byte chunk. */
16148 while (n >= 8)
16149 {
16150 if (n / 16)
16151 {
16152 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16153 n -= 16;
16154 }
16155 else
16156 {
16157 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16158 n -= 8;
16159 }
16160 }
16161
16162 /* Finish the final bytes of the copy. We can always do this in one
16163 instruction. We either copy the exact amount we need, or partially
16164 overlap with the previous chunk we copied and copy 8 bytes. */
16165 if (n == 0)
16166 return true;
16167 else if (n == 1)
16168 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16169 else if (n == 2)
16170 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16171 else if (n == 4)
16172 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16173 else
16174 {
16175 if (n == 3)
16176 {
16177 src = aarch64_move_pointer (src, -1);
16178 dst = aarch64_move_pointer (dst, -1);
16179 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16180 }
16181 else
16182 {
16183 int move = n - 8;
16184
16185 src = aarch64_move_pointer (src, move);
16186 dst = aarch64_move_pointer (dst, move);
16187 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16188 }
16189 }
16190
16191 return true;
16192 }
16193
16194 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16195 SImode stores. Handle the case when the constant has identical
16196 bottom and top halves. This is beneficial when the two stores can be
16197 merged into an STP and we avoid synthesising potentially expensive
16198 immediates twice. Return true if such a split is possible. */
16199
16200 bool
16201 aarch64_split_dimode_const_store (rtx dst, rtx src)
16202 {
16203 rtx lo = gen_lowpart (SImode, src);
16204 rtx hi = gen_highpart_mode (SImode, DImode, src);
16205
16206 bool size_p = optimize_function_for_size_p (cfun);
16207
16208 if (!rtx_equal_p (lo, hi))
16209 return false;
16210
16211 unsigned int orig_cost
16212 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16213 unsigned int lo_cost
16214 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16215
16216 /* We want to transform:
16217 MOV x1, 49370
16218 MOVK x1, 0x140, lsl 16
16219 MOVK x1, 0xc0da, lsl 32
16220 MOVK x1, 0x140, lsl 48
16221 STR x1, [x0]
16222 into:
16223 MOV w1, 49370
16224 MOVK w1, 0x140, lsl 16
16225 STP w1, w1, [x0]
16226 So we want to perform this only when we save two instructions
16227 or more. When optimizing for size, however, accept any code size
16228 savings we can. */
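/* In the example above the DImode constant needs a MOV and three MOVKs
   (orig_cost == 4) while the SImode low half needs a MOV and one MOVK
   (lo_cost == 2), so the checks below accept the split both when
   optimizing for size and when optimizing for speed.  */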
16229 if (size_p && orig_cost <= lo_cost)
16230 return false;
16231
16232 if (!size_p
16233 && (orig_cost <= lo_cost + 1))
16234 return false;
16235
16236 rtx mem_lo = adjust_address (dst, SImode, 0);
16237 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16238 return false;
16239
16240 rtx tmp_reg = gen_reg_rtx (SImode);
16241 aarch64_expand_mov_immediate (tmp_reg, lo);
16242 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16243 /* Don't emit an explicit store pair as this may not be always profitable.
16244 Let the sched-fusion logic decide whether to merge them. */
16245 emit_move_insn (mem_lo, tmp_reg);
16246 emit_move_insn (mem_hi, tmp_reg);
16247
16248 return true;
16249 }
16250
16251 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
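/* On AArch64 the ASan shadow region lives at offset 1 << 36 (0x1000000000);
   this value must match the shadow offset that libsanitizer is built with.  */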
16252
16253 static unsigned HOST_WIDE_INT
16254 aarch64_asan_shadow_offset (void)
16255 {
16256 return (HOST_WIDE_INT_1 << 36);
16257 }
16258
16259 static rtx
16260 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16261 int code, tree treeop0, tree treeop1)
16262 {
16263 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16264 rtx op0, op1;
16265 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16266 insn_code icode;
16267 struct expand_operand ops[4];
16268
16269 start_sequence ();
16270 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16271
16272 op_mode = GET_MODE (op0);
16273 if (op_mode == VOIDmode)
16274 op_mode = GET_MODE (op1);
16275
16276 switch (op_mode)
16277 {
16278 case E_QImode:
16279 case E_HImode:
16280 case E_SImode:
16281 cmp_mode = SImode;
16282 icode = CODE_FOR_cmpsi;
16283 break;
16284
16285 case E_DImode:
16286 cmp_mode = DImode;
16287 icode = CODE_FOR_cmpdi;
16288 break;
16289
16290 case E_SFmode:
16291 cmp_mode = SFmode;
16292 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16293 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16294 break;
16295
16296 case E_DFmode:
16297 cmp_mode = DFmode;
16298 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16299 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16300 break;
16301
16302 default:
16303 end_sequence ();
16304 return NULL_RTX;
16305 }
16306
16307 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16308 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16309 if (!op0 || !op1)
16310 {
16311 end_sequence ();
16312 return NULL_RTX;
16313 }
16314 *prep_seq = get_insns ();
16315 end_sequence ();
16316
16317 create_fixed_operand (&ops[0], op0);
16318 create_fixed_operand (&ops[1], op1);
16319
16320 start_sequence ();
16321 if (!maybe_expand_insn (icode, 2, ops))
16322 {
16323 end_sequence ();
16324 return NULL_RTX;
16325 }
16326 *gen_seq = get_insns ();
16327 end_sequence ();
16328
16329 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16330 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16331 }
16332
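/* Implement TARGET_GEN_CCMP_NEXT (see the #define further below).  Expand a
   conditional compare of TREEOP0 and TREEOP1 that is chained onto the
   previous comparison PREV with BIT_CODE (AND or IOR), extending *PREP_SEQ
   and *GEN_SEQ, and return the new comparison rtx, or NULL_RTX on failure.  */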
16333 static rtx
16334 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16335 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16336 {
16337 rtx op0, op1, target;
16338 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16339 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16340 insn_code icode;
16341 struct expand_operand ops[6];
16342 int aarch64_cond;
16343
16344 push_to_sequence (*prep_seq);
16345 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16346
16347 op_mode = GET_MODE (op0);
16348 if (op_mode == VOIDmode)
16349 op_mode = GET_MODE (op1);
16350
16351 switch (op_mode)
16352 {
16353 case E_QImode:
16354 case E_HImode:
16355 case E_SImode:
16356 cmp_mode = SImode;
16357 icode = CODE_FOR_ccmpsi;
16358 break;
16359
16360 case E_DImode:
16361 cmp_mode = DImode;
16362 icode = CODE_FOR_ccmpdi;
16363 break;
16364
16365 case E_SFmode:
16366 cmp_mode = SFmode;
16367 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16368 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16369 break;
16370
16371 case E_DFmode:
16372 cmp_mode = DFmode;
16373 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16374 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16375 break;
16376
16377 default:
16378 end_sequence ();
16379 return NULL_RTX;
16380 }
16381
16382 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16383 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16384 if (!op0 || !op1)
16385 {
16386 end_sequence ();
16387 return NULL_RTX;
16388 }
16389 *prep_seq = get_insns ();
16390 end_sequence ();
16391
16392 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16393 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16394
16395 if (bit_code != AND)
16396 {
16397 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16398 GET_MODE (XEXP (prev, 0))),
16399 VOIDmode, XEXP (prev, 0), const0_rtx);
16400 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16401 }
16402
16403 create_fixed_operand (&ops[0], XEXP (prev, 0));
16404 create_fixed_operand (&ops[1], target);
16405 create_fixed_operand (&ops[2], op0);
16406 create_fixed_operand (&ops[3], op1);
16407 create_fixed_operand (&ops[4], prev);
16408 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16409
16410 push_to_sequence (*gen_seq);
16411 if (!maybe_expand_insn (icode, 6, ops))
16412 {
16413 end_sequence ();
16414 return NULL_RTX;
16415 }
16416
16417 *gen_seq = get_insns ();
16418 end_sequence ();
16419
16420 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16421 }
16422
16423 #undef TARGET_GEN_CCMP_FIRST
16424 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16425
16426 #undef TARGET_GEN_CCMP_NEXT
16427 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16428
16429 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if the target
16430 supports instruction fusion of some sort.  */
16431
16432 static bool
16433 aarch64_macro_fusion_p (void)
16434 {
16435 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16436 }
16437
16438
16439 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16440 should be kept together during scheduling. */
16441
16442 static bool
16443 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16444 {
16445 rtx set_dest;
16446 rtx prev_set = single_set (prev);
16447 rtx curr_set = single_set (curr);
16448 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16449 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16450
16451 if (!aarch64_macro_fusion_p ())
16452 return false;
16453
16454 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16455 {
16456 /* We are trying to match:
16457 prev (mov) == (set (reg r0) (const_int imm16))
16458 curr (movk) == (set (zero_extract (reg r0)
16459 (const_int 16)
16460 (const_int 16))
16461 (const_int imm16_1)) */
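/* In assembly terms: "mov x0, #imm16" followed by
   "movk x0, #imm16_1, lsl 16" (the register name is illustrative).  */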
16462
16463 set_dest = SET_DEST (curr_set);
16464
16465 if (GET_CODE (set_dest) == ZERO_EXTRACT
16466 && CONST_INT_P (SET_SRC (curr_set))
16467 && CONST_INT_P (SET_SRC (prev_set))
16468 && CONST_INT_P (XEXP (set_dest, 2))
16469 && INTVAL (XEXP (set_dest, 2)) == 16
16470 && REG_P (XEXP (set_dest, 0))
16471 && REG_P (SET_DEST (prev_set))
16472 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16473 {
16474 return true;
16475 }
16476 }
16477
16478 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16479 {
16480
16481 /* We're trying to match:
16482 prev (adrp) == (set (reg r1)
16483 (high (symbol_ref ("SYM"))))
16484 curr (add) == (set (reg r0)
16485 (lo_sum (reg r1)
16486 (symbol_ref ("SYM"))))
16487 Note that r0 need not be the same as r1, especially during
16488 pre-regalloc scheduling.  */
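/* In assembly terms: "adrp x1, SYM" followed by
   "add x0, x1, :lo12:SYM" (register names are illustrative).  */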
16489
16490 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16491 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16492 {
16493 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16494 && REG_P (XEXP (SET_SRC (curr_set), 0))
16495 && REGNO (XEXP (SET_SRC (curr_set), 0))
16496 == REGNO (SET_DEST (prev_set))
16497 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16498 XEXP (SET_SRC (curr_set), 1)))
16499 return true;
16500 }
16501 }
16502
16503 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16504 {
16505
16506 /* We're trying to match:
16507 prev (movk) == (set (zero_extract (reg r0)
16508 (const_int 16)
16509 (const_int 32))
16510 (const_int imm16_1))
16511 curr (movk) == (set (zero_extract (reg r0)
16512 (const_int 16)
16513 (const_int 48))
16514 (const_int imm16_2)) */
16515
16516 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16517 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16518 && REG_P (XEXP (SET_DEST (prev_set), 0))
16519 && REG_P (XEXP (SET_DEST (curr_set), 0))
16520 && REGNO (XEXP (SET_DEST (prev_set), 0))
16521 == REGNO (XEXP (SET_DEST (curr_set), 0))
16522 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16523 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16524 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16525 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16526 && CONST_INT_P (SET_SRC (prev_set))
16527 && CONST_INT_P (SET_SRC (curr_set)))
16528 return true;
16529
16530 }
16531 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16532 {
16533 /* We're trying to match:
16534 prev (adrp) == (set (reg r0)
16535 (high (symbol_ref ("SYM"))))
16536 curr (ldr) == (set (reg r1)
16537 (mem (lo_sum (reg r0)
16538 (symbol_ref ("SYM")))))
16539 or
16540 curr (ldr) == (set (reg r1)
16541 (zero_extend (mem
16542 (lo_sum (reg r0)
16543 (symbol_ref ("SYM")))))) */
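/* In assembly terms: "adrp x0, SYM" followed by
   "ldr x1, [x0, :lo12:SYM]" (register names are illustrative).  */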
16544 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16545 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16546 {
16547 rtx curr_src = SET_SRC (curr_set);
16548
16549 if (GET_CODE (curr_src) == ZERO_EXTEND)
16550 curr_src = XEXP (curr_src, 0);
16551
16552 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16553 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16554 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16555 == REGNO (SET_DEST (prev_set))
16556 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16557 XEXP (SET_SRC (prev_set), 0)))
16558 return true;
16559 }
16560 }
16561
16562 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16563 && aarch_crypto_can_dual_issue (prev, curr))
16564 return true;
16565
16566 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16567 && any_condjump_p (curr))
16568 {
16569 enum attr_type prev_type = get_attr_type (prev);
16570
16571 unsigned int condreg1, condreg2;
16572 rtx cc_reg_1;
16573 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16574 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16575
16576 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16577 && prev
16578 && modified_in_p (cc_reg_1, prev))
16579 {
16580 /* FIXME: this misses some instructions that ThunderX considers simple
16581 arithmetic; simple shifts, for example, are missed here.  */
16582 if (prev_type == TYPE_ALUS_SREG
16583 || prev_type == TYPE_ALUS_IMM
16584 || prev_type == TYPE_LOGICS_REG
16585 || prev_type == TYPE_LOGICS_IMM)
16586 return true;
16587 }
16588 }
16589
16590 if (prev_set
16591 && curr_set
16592 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16593 && any_condjump_p (curr))
16594 {
16595 /* We're trying to match:
16596 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16597 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16598 (const_int 0))
16599 (label_ref ("SYM"))
16600 (pc)) */
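/* In assembly terms: e.g. "add w0, w0, #1" followed by
   "cbz w0, .Ltarget" (operands are illustrative).  */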
16601 if (SET_DEST (curr_set) == (pc_rtx)
16602 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16603 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16604 && REG_P (SET_DEST (prev_set))
16605 && REGNO (SET_DEST (prev_set))
16606 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16607 {
16608 /* Fuse an ALU operation followed by a conditional branch instruction.  */
16609 switch (get_attr_type (prev))
16610 {
16611 case TYPE_ALU_IMM:
16612 case TYPE_ALU_SREG:
16613 case TYPE_ADC_REG:
16614 case TYPE_ADC_IMM:
16615 case TYPE_ADCS_REG:
16616 case TYPE_ADCS_IMM:
16617 case TYPE_LOGIC_REG:
16618 case TYPE_LOGIC_IMM:
16619 case TYPE_CSEL:
16620 case TYPE_ADR:
16621 case TYPE_MOV_IMM:
16622 case TYPE_SHIFT_REG:
16623 case TYPE_SHIFT_IMM:
16624 case TYPE_BFM:
16625 case TYPE_RBIT:
16626 case TYPE_REV:
16627 case TYPE_EXTEND:
16628 return true;
16629
16630 default:;
16631 }
16632 }
16633 }
16634
16635 return false;
16636 }
16637
16638 /* Return true iff the instruction fusion described by OP is enabled. */
16639
16640 bool
16641 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16642 {
16643 return (aarch64_tune_params.fusible_ops & op) != 0;
16644 }
16645
16646 /* If MEM is in the form of [base+offset], extract the two parts of the
16647 address and store them in BASE and OFFSET; otherwise return false
16648 after clearing BASE and OFFSET.  */
16649
16650 bool
16651 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16652 {
16653 rtx addr;
16654
16655 gcc_assert (MEM_P (mem));
16656
16657 addr = XEXP (mem, 0);
16658
16659 if (REG_P (addr))
16660 {
16661 *base = addr;
16662 *offset = const0_rtx;
16663 return true;
16664 }
16665
16666 if (GET_CODE (addr) == PLUS
16667 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16668 {
16669 *base = XEXP (addr, 0);
16670 *offset = XEXP (addr, 1);
16671 return true;
16672 }
16673
16674 *base = NULL_RTX;
16675 *offset = NULL_RTX;
16676
16677 return false;
16678 }
16679
16680 /* Types for scheduling fusion. */
16681 enum sched_fusion_type
16682 {
16683 SCHED_FUSION_NONE = 0,
16684 SCHED_FUSION_LD_SIGN_EXTEND,
16685 SCHED_FUSION_LD_ZERO_EXTEND,
16686 SCHED_FUSION_LD,
16687 SCHED_FUSION_ST,
16688 SCHED_FUSION_NUM
16689 };
16690
16691 /* If INSN is a load or store whose address is in the form [base+offset],
16692 extract the two parts into BASE and OFFSET.  Return the scheduling
16693 fusion type of this INSN.  */
16694
16695 static enum sched_fusion_type
16696 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16697 {
16698 rtx x, dest, src;
16699 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16700
16701 gcc_assert (INSN_P (insn));
16702 x = PATTERN (insn);
16703 if (GET_CODE (x) != SET)
16704 return SCHED_FUSION_NONE;
16705
16706 src = SET_SRC (x);
16707 dest = SET_DEST (x);
16708
16709 machine_mode dest_mode = GET_MODE (dest);
16710
16711 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16712 return SCHED_FUSION_NONE;
16713
16714 if (GET_CODE (src) == SIGN_EXTEND)
16715 {
16716 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16717 src = XEXP (src, 0);
16718 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16719 return SCHED_FUSION_NONE;
16720 }
16721 else if (GET_CODE (src) == ZERO_EXTEND)
16722 {
16723 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16724 src = XEXP (src, 0);
16725 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16726 return SCHED_FUSION_NONE;
16727 }
16728
16729 if (GET_CODE (src) == MEM && REG_P (dest))
16730 extract_base_offset_in_addr (src, base, offset);
16731 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16732 {
16733 fusion = SCHED_FUSION_ST;
16734 extract_base_offset_in_addr (dest, base, offset);
16735 }
16736 else
16737 return SCHED_FUSION_NONE;
16738
16739 if (*base == NULL_RTX || *offset == NULL_RTX)
16740 fusion = SCHED_FUSION_NONE;
16741
16742 return fusion;
16743 }
16744
16745 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16746
16747 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16748 and PRI are only calculated for these instructions.  For other instructions,
16749 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
16750 types of instruction fusion can be added by returning different priorities.
16751
16752 It's important that irrelevant instructions get the largest FUSION_PRI. */
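
/* Concretely (a descriptive sketch of the code below): for a fusible load or
   store, FUSION_PRI becomes (MAX_PRI - 1) - fusion_type * FIRST_PSEUDO_REGISTER
   - REGNO (base), and PRI becomes roughly (MAX_PRI - 1) / 2 minus the low bits
   of the offset (plus them for negative offsets), so accesses off the same
   base register are grouped together and ordered by increasing offset.  */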
16753
16754 static void
16755 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16756 int *fusion_pri, int *pri)
16757 {
16758 int tmp, off_val;
16759 rtx base, offset;
16760 enum sched_fusion_type fusion;
16761
16762 gcc_assert (INSN_P (insn));
16763
16764 tmp = max_pri - 1;
16765 fusion = fusion_load_store (insn, &base, &offset);
16766 if (fusion == SCHED_FUSION_NONE)
16767 {
16768 *pri = tmp;
16769 *fusion_pri = tmp;
16770 return;
16771 }
16772
16773 /* Set FUSION_PRI according to fusion type and base register. */
16774 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16775
16776 /* Calculate PRI. */
16777 tmp /= 2;
16778
16779 /* INSN with smaller offset goes first. */
16780 off_val = (int)(INTVAL (offset));
16781 if (off_val >= 0)
16782 tmp -= (off_val & 0xfffff);
16783 else
16784 tmp += ((- off_val) & 0xfffff);
16785
16786 *pri = tmp;
16787 return;
16788 }
16789
16790 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16791 Adjust priority of sha1h instructions so they are scheduled before
16792 other SHA1 instructions. */
16793
16794 static int
16795 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16796 {
16797 rtx x = PATTERN (insn);
16798
16799 if (GET_CODE (x) == SET)
16800 {
16801 x = SET_SRC (x);
16802
16803 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16804 return priority + 10;
16805 }
16806
16807 return priority;
16808 }
16809
16810 /* Given OPERANDS of consecutive load/store, check if we can merge
16811 them into ldp/stp. LOAD is true if they are load instructions.
16812 MODE is the mode of memory operands. */
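
/* For example (illustrative operands): "ldr w0, [x2]" and "ldr w1, [x2, 4]"
   pass these checks and can be merged into "ldp w0, w1, [x2]".  */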
16813
16814 bool
16815 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16816 machine_mode mode)
16817 {
16818 HOST_WIDE_INT offval_1, offval_2, msize;
16819 enum reg_class rclass_1, rclass_2;
16820 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16821
16822 if (load)
16823 {
16824 mem_1 = operands[1];
16825 mem_2 = operands[3];
16826 reg_1 = operands[0];
16827 reg_2 = operands[2];
16828 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16829 if (REGNO (reg_1) == REGNO (reg_2))
16830 return false;
16831 }
16832 else
16833 {
16834 mem_1 = operands[0];
16835 mem_2 = operands[2];
16836 reg_1 = operands[1];
16837 reg_2 = operands[3];
16838 }
16839
16840 /* The mems cannot be volatile. */
16841 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16842 return false;
16843
16844 /* If we have SImode and slow unaligned ldp,
16845 check that the alignment is at least 8 bytes.  */
16846 if (mode == SImode
16847 && (aarch64_tune_params.extra_tuning_flags
16848 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16849 && !optimize_size
16850 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16851 return false;
16852
16853 /* Check if the addresses are in the form of [base+offset]. */
16854 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16855 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16856 return false;
16857 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16858 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16859 return false;
16860
16861 /* Check if the bases are the same.  */
16862 if (!rtx_equal_p (base_1, base_2))
16863 return false;
16864
16865 /* The operands must be of the same size. */
16866 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16867 GET_MODE_SIZE (GET_MODE (mem_2))));
16868
16869 offval_1 = INTVAL (offset_1);
16870 offval_2 = INTVAL (offset_2);
16871 /* We should only be trying this for fixed-sized modes. There is no
16872 SVE LDP/STP instruction. */
16873 msize = GET_MODE_SIZE (mode).to_constant ();
16874 /* Check if the offsets are consecutive. */
16875 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16876 return false;
16877
16878 /* Check if the addresses are clobbered by load. */
16879 if (load)
16880 {
16881 if (reg_mentioned_p (reg_1, mem_1))
16882 return false;
16883
16884 /* In increasing order, the last load can clobber the address. */
16885 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16886 return false;
16887 }
16888
16889 /* One of the memory accesses must be a mempair operand.
16890 If it is not the first one, they need to be swapped by the
16891 peephole. */
16892 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16893 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16894 return false;
16895
16896 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16897 rclass_1 = FP_REGS;
16898 else
16899 rclass_1 = GENERAL_REGS;
16900
16901 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16902 rclass_2 = FP_REGS;
16903 else
16904 rclass_2 = GENERAL_REGS;
16905
16906 /* Check if the registers are of the same class.  */
16907 if (rclass_1 != rclass_2)
16908 return false;
16909
16910 return true;
16911 }
16912
16913 /* Given OPERANDS of consecutive load/store that can be merged,
16914 swap them if they are not in ascending order. */
16915 void
16916 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16917 {
16918 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16919 HOST_WIDE_INT offval_1, offval_2;
16920
16921 if (load)
16922 {
16923 mem_1 = operands[1];
16924 mem_2 = operands[3];
16925 }
16926 else
16927 {
16928 mem_1 = operands[0];
16929 mem_2 = operands[2];
16930 }
16931
16932 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16933 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16934
16935 offval_1 = INTVAL (offset_1);
16936 offval_2 = INTVAL (offset_2);
16937
16938 if (offval_1 > offval_2)
16939 {
16940 /* Irrespective of whether this is a load or a store,
16941 we do the same swap. */
16942 std::swap (operands[0], operands[2]);
16943 std::swap (operands[1], operands[3]);
16944 }
16945 }
16946
16947 /* Given OPERANDS of consecutive load/store, check if we can merge
16948 them into ldp/stp by adjusting the offset. LOAD is true if they
16949 are load instructions. MODE is the mode of memory operands.
16950
16951 Given the following consecutive stores:
16952
16953 str w1, [xb, 0x100]
16954 str w1, [xb, 0x104]
16955 str w1, [xb, 0x108]
16956 str w1, [xb, 0x10c]
16957
16958 Though the offsets are out of the range supported by stp, we can
16959 still pair them after adjusting the offset, like:
16960
16961 add scratch, xb, 0x100
16962 stp w1, w1, [scratch]
16963 stp w1, w1, [scratch, 0x8]
16964
16965 The peephole patterns detecting this opportunity should guarantee
16966 the scratch register is available.  */
16967
16968 bool
16969 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16970 scalar_mode mode)
16971 {
16972 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16973 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16974 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16975 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16976
16977 if (load)
16978 {
16979 reg_1 = operands[0];
16980 mem_1 = operands[1];
16981 reg_2 = operands[2];
16982 mem_2 = operands[3];
16983 reg_3 = operands[4];
16984 mem_3 = operands[5];
16985 reg_4 = operands[6];
16986 mem_4 = operands[7];
16987 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16988 && REG_P (reg_3) && REG_P (reg_4));
16989 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16990 return false;
16991 }
16992 else
16993 {
16994 mem_1 = operands[0];
16995 reg_1 = operands[1];
16996 mem_2 = operands[2];
16997 reg_2 = operands[3];
16998 mem_3 = operands[4];
16999 reg_3 = operands[5];
17000 mem_4 = operands[6];
17001 reg_4 = operands[7];
17002 }
17003 /* Skip if the memory operand is by itself valid for ldp/stp.  */
17004 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
17005 return false;
17006
17007 /* The mems cannot be volatile. */
17008 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
17009 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
17010 return false;
17011
17012 /* Check if the addresses are in the form of [base+offset]. */
17013 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17014 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17015 return false;
17016 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17017 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17018 return false;
17019 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
17020 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
17021 return false;
17022 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
17023 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
17024 return false;
17025
17026 /* Check if the bases are the same.  */
17027 if (!rtx_equal_p (base_1, base_2)
17028 || !rtx_equal_p (base_2, base_3)
17029 || !rtx_equal_p (base_3, base_4))
17030 return false;
17031
17032 offval_1 = INTVAL (offset_1);
17033 offval_2 = INTVAL (offset_2);
17034 offval_3 = INTVAL (offset_3);
17035 offval_4 = INTVAL (offset_4);
17036 msize = GET_MODE_SIZE (mode);
17037 /* Check if the offsets are consecutive. */
17038 if ((offval_1 != (offval_2 + msize)
17039 || offval_1 != (offval_3 + msize * 2)
17040 || offval_1 != (offval_4 + msize * 3))
17041 && (offval_4 != (offval_3 + msize)
17042 || offval_4 != (offval_2 + msize * 2)
17043 || offval_4 != (offval_1 + msize * 3)))
17044 return false;
17045
17046 /* Check if the addresses are clobbered by load. */
17047 if (load)
17048 {
17049 if (reg_mentioned_p (reg_1, mem_1)
17050 || reg_mentioned_p (reg_2, mem_2)
17051 || reg_mentioned_p (reg_3, mem_3))
17052 return false;
17053
17054 /* In increasing order, the last load can clobber the address. */
17055 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
17056 return false;
17057 }
17058
17059 /* If we have SImode and slow unaligned ldp,
17060 check that the alignment is at least 8 bytes.  */
17061 if (mode == SImode
17062 && (aarch64_tune_params.extra_tuning_flags
17063 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17064 && !optimize_size
17065 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17066 return false;
17067
17068 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17069 rclass_1 = FP_REGS;
17070 else
17071 rclass_1 = GENERAL_REGS;
17072
17073 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17074 rclass_2 = FP_REGS;
17075 else
17076 rclass_2 = GENERAL_REGS;
17077
17078 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17079 rclass_3 = FP_REGS;
17080 else
17081 rclass_3 = GENERAL_REGS;
17082
17083 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17084 rclass_4 = FP_REGS;
17085 else
17086 rclass_4 = GENERAL_REGS;
17087
17088 /* Check if the registers are of the same class.  */
17089 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17090 return false;
17091
17092 return true;
17093 }
17094
17095 /* Given OPERANDS of consecutive load/store, this function pairs them
17096 into ldp/stp after adjusting the offset. It depends on the fact
17097 that addresses of load/store instructions are in increasing order.
17098 MODE is the mode of memory operands. CODE is the rtl operator
17099 which should be applied to all memory operands; it is SIGN_EXTEND,
17100 ZERO_EXTEND or UNKNOWN.  */
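
/* As a worked example (using the SImode stores shown above
   aarch64_operands_adjust_ok_for_ldpstp): msize == 4 gives
   stp_off_limit == 0x100, so for a first offset of 0x100 we get
   adj_off == 0x100 and new_off == 0; operands[8] is set to base + 0x100
   and the four accesses become offsets 0, 4, 8 and 12 from it, which the
   two emitted ldp/stp instructions then cover.  */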
17101
17102 bool
17103 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17104 scalar_mode mode, RTX_CODE code)
17105 {
17106 rtx base, offset_1, offset_2, t1, t2;
17107 rtx mem_1, mem_2, mem_3, mem_4;
17108 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
17109
17110 if (load)
17111 {
17112 mem_1 = operands[1];
17113 mem_2 = operands[3];
17114 }
17115 else
17116 {
17117 mem_1 = operands[0];
17118 mem_2 = operands[2];
17119 }
17120
17121 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17122 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17123 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17124 && offset_2 != NULL_RTX);
17125
17126 if (INTVAL (offset_1) > INTVAL (offset_2))
17127 {
17128 std::swap (operands[0], operands[6]);
17129 std::swap (operands[1], operands[7]);
17130 std::swap (operands[2], operands[4]);
17131 std::swap (operands[3], operands[5]);
17132 }
17133
17134 if (load)
17135 {
17136 mem_1 = operands[1];
17137 mem_2 = operands[3];
17138 mem_3 = operands[5];
17139 mem_4 = operands[7];
17140 }
17141 else
17142 {
17143 mem_1 = operands[0];
17144 mem_2 = operands[2];
17145 mem_3 = operands[4];
17146 mem_4 = operands[6];
17147 gcc_assert (code == UNKNOWN);
17148 }
17149
17150 /* Extract the offset of the new first address. */
17151 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17152 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17153
17154 /* Adjust the offset so that it can fit in an ldp/stp instruction.  */
17155 msize = GET_MODE_SIZE (mode);
17156 stp_off_limit = msize * 0x40;
17157 off_val = INTVAL (offset_1);
17158 abs_off = (off_val < 0) ? -off_val : off_val;
17159 new_off = abs_off % stp_off_limit;
17160 adj_off = abs_off - new_off;
17161
17162 /* Further adjust to make sure all offsets are OK. */
17163 if ((new_off + msize * 2) >= stp_off_limit)
17164 {
17165 adj_off += stp_off_limit;
17166 new_off -= stp_off_limit;
17167 }
17168
17169 /* Make sure the adjustment can be done with ADD/SUB instructions. */
17170 if (adj_off >= 0x1000)
17171 return false;
17172
17173 if (off_val < 0)
17174 {
17175 adj_off = -adj_off;
17176 new_off = -new_off;
17177 }
17178
17179 /* Create new memory references. */
17180 mem_1 = change_address (mem_1, VOIDmode,
17181 plus_constant (DImode, operands[8], new_off));
17182
17183 /* Check if the adjusted address is OK for ldp/stp. */
17184 if (!aarch64_mem_pair_operand (mem_1, mode))
17185 return false;
17186
17187 msize = GET_MODE_SIZE (mode);
17188 mem_2 = change_address (mem_2, VOIDmode,
17189 plus_constant (DImode,
17190 operands[8],
17191 new_off + msize));
17192 mem_3 = change_address (mem_3, VOIDmode,
17193 plus_constant (DImode,
17194 operands[8],
17195 new_off + msize * 2));
17196 mem_4 = change_address (mem_4, VOIDmode,
17197 plus_constant (DImode,
17198 operands[8],
17199 new_off + msize * 3));
17200
17201 if (code == ZERO_EXTEND)
17202 {
17203 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17204 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17205 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17206 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17207 }
17208 else if (code == SIGN_EXTEND)
17209 {
17210 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17211 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17212 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17213 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17214 }
17215
17216 if (load)
17217 {
17218 operands[1] = mem_1;
17219 operands[3] = mem_2;
17220 operands[5] = mem_3;
17221 operands[7] = mem_4;
17222 }
17223 else
17224 {
17225 operands[0] = mem_1;
17226 operands[2] = mem_2;
17227 operands[4] = mem_3;
17228 operands[6] = mem_4;
17229 }
17230
17231 /* Emit adjusting instruction. */
17232 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
17233 /* Emit ldp/stp instructions. */
17234 t1 = gen_rtx_SET (operands[0], operands[1]);
17235 t2 = gen_rtx_SET (operands[2], operands[3]);
17236 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17237 t1 = gen_rtx_SET (operands[4], operands[5]);
17238 t2 = gen_rtx_SET (operands[6], operands[7]);
17239 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17240 return true;
17241 }
17242
17243 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17244 it isn't worth branching around empty masked ops (including masked
17245 stores). */
17246
17247 static bool
17248 aarch64_empty_mask_is_expensive (unsigned)
17249 {
17250 return false;
17251 }
17252
17253 /* Return true if a pseudo register should be created and used to hold
17254 the GOT address for PIC code.  */
17255
17256 bool
17257 aarch64_use_pseudo_pic_reg (void)
17258 {
17259 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17260 }
17261
17262 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17263
17264 static int
17265 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17266 {
17267 switch (XINT (x, 1))
17268 {
17269 case UNSPEC_GOTSMALLPIC:
17270 case UNSPEC_GOTSMALLPIC28K:
17271 case UNSPEC_GOTTINYPIC:
17272 return 0;
17273 default:
17274 break;
17275 }
17276
17277 return default_unspec_may_trap_p (x, flags);
17278 }
17279
17280
17281 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17282 return the log2 of that value. Otherwise return -1. */
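
/* For example, 4.0 yields 2 and 1.0 yields 0, while -2.0, 0.5 (not an
   integer) and 3.0 (not a power of 2) all yield -1.  */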
17283
17284 int
17285 aarch64_fpconst_pow_of_2 (rtx x)
17286 {
17287 const REAL_VALUE_TYPE *r;
17288
17289 if (!CONST_DOUBLE_P (x))
17290 return -1;
17291
17292 r = CONST_DOUBLE_REAL_VALUE (x);
17293
17294 if (REAL_VALUE_NEGATIVE (*r)
17295 || REAL_VALUE_ISNAN (*r)
17296 || REAL_VALUE_ISINF (*r)
17297 || !real_isinteger (r, DFmode))
17298 return -1;
17299
17300 return exact_log2 (real_to_integer (r));
17301 }
17302
17303 /* If X is a vector of equal CONST_DOUBLE values and that value is
17304 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17305
17306 int
17307 aarch64_vec_fpconst_pow_of_2 (rtx x)
17308 {
17309 int nelts;
17310 if (GET_CODE (x) != CONST_VECTOR
17311 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17312 return -1;
17313
17314 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17315 return -1;
17316
17317 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17318 if (firstval <= 0)
17319 return -1;
17320
17321 for (int i = 1; i < nelts; i++)
17322 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17323 return -1;
17324
17325 return firstval;
17326 }
17327
17328 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17329 to float.
17330
17331 __fp16 always promotes through this hook.
17332 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17333 through the generic excess precision logic rather than here. */
17334
17335 static tree
17336 aarch64_promoted_type (const_tree t)
17337 {
17338 if (SCALAR_FLOAT_TYPE_P (t)
17339 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17340 return float_type_node;
17341
17342 return NULL_TREE;
17343 }
17344
17345 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17346
17347 static bool
17348 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17349 optimization_type opt_type)
17350 {
17351 switch (op)
17352 {
17353 case rsqrt_optab:
17354 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17355
17356 default:
17357 return true;
17358 }
17359 }
17360
17361 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17362
17363 static unsigned int
17364 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17365 int *offset)
17366 {
17367 /* Polynomial invariant 1 == (VG / 2) - 1. */
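/* For example, with 256-bit SVE vectors VG == 4 (four 64-bit granules),
   giving 4 / 2 - 1 == 1; an SVE data mode of 16 + 16 * 1 == 32 bytes then
   corresponds to that vector length (illustrative arithmetic).  */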
17368 gcc_assert (i == 1);
17369 *factor = 2;
17370 *offset = 1;
17371 return AARCH64_DWARF_VG;
17372 }
17373
17374 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17375 if MODE is HFmode, and punt to the generic implementation otherwise. */
17376
17377 static bool
17378 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17379 {
17380 return (mode == HFmode
17381 ? true
17382 : default_libgcc_floating_mode_supported_p (mode));
17383 }
17384
17385 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17386 if MODE is HFmode, and punt to the generic implementation otherwise. */
17387
17388 static bool
17389 aarch64_scalar_mode_supported_p (scalar_mode mode)
17390 {
17391 return (mode == HFmode
17392 ? true
17393 : default_scalar_mode_supported_p (mode));
17394 }
17395
17396 /* Set the value of FLT_EVAL_METHOD.
17397 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17398
17399 0: evaluate all operations and constants, whose semantic type has at
17400 most the range and precision of type float, to the range and
17401 precision of float; evaluate all other operations and constants to
17402 the range and precision of the semantic type;
17403
17404 N, where _FloatN is a supported interchange floating type:
17405 evaluate all operations and constants, whose semantic type has at
17406 most the range and precision of _FloatN type, to the range and
17407 precision of the _FloatN type; evaluate all other operations and
17408 constants to the range and precision of the semantic type;
17409
17410 If we have the ARMv8.2-A extensions then we support _Float16 in native
17411 precision, so we should set this to 16. Otherwise, we support the type,
17412 but want to evaluate expressions in float precision, so set this to
17413 0. */
17414
17415 static enum flt_eval_method
17416 aarch64_excess_precision (enum excess_precision_type type)
17417 {
17418 switch (type)
17419 {
17420 case EXCESS_PRECISION_TYPE_FAST:
17421 case EXCESS_PRECISION_TYPE_STANDARD:
17422 /* We can calculate either in 16-bit range and precision or
17423 32-bit range and precision. Make that decision based on whether
17424 we have native support for the ARMv8.2-A 16-bit floating-point
17425 instructions or not. */
17426 return (TARGET_FP_F16INST
17427 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17428 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17429 case EXCESS_PRECISION_TYPE_IMPLICIT:
17430 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17431 default:
17432 gcc_unreachable ();
17433 }
17434 return FLT_EVAL_METHOD_UNPREDICTABLE;
17435 }
17436
17437 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17438 scheduled for speculative execution. Reject the long-running division
17439 and square-root instructions. */
17440
17441 static bool
17442 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17443 {
17444 switch (get_attr_type (insn))
17445 {
17446 case TYPE_SDIV:
17447 case TYPE_UDIV:
17448 case TYPE_FDIVS:
17449 case TYPE_FDIVD:
17450 case TYPE_FSQRTS:
17451 case TYPE_FSQRTD:
17452 case TYPE_NEON_FP_SQRT_S:
17453 case TYPE_NEON_FP_SQRT_D:
17454 case TYPE_NEON_FP_SQRT_S_Q:
17455 case TYPE_NEON_FP_SQRT_D_Q:
17456 case TYPE_NEON_FP_DIV_S:
17457 case TYPE_NEON_FP_DIV_D:
17458 case TYPE_NEON_FP_DIV_S_Q:
17459 case TYPE_NEON_FP_DIV_D_Q:
17460 return false;
17461 default:
17462 return true;
17463 }
17464 }
17465
17466 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17467
17468 static int
17469 aarch64_compute_pressure_classes (reg_class *classes)
17470 {
17471 int i = 0;
17472 classes[i++] = GENERAL_REGS;
17473 classes[i++] = FP_REGS;
17474 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17475 registers need to go in PR_LO_REGS at some point during their
17476 lifetime. Splitting it into two halves has the effect of making
17477 all predicates count against PR_LO_REGS, so that we try whenever
17478 possible to restrict the number of live predicates to 8. This
17479 greatly reduces the amount of spilling in certain loops. */
17480 classes[i++] = PR_LO_REGS;
17481 classes[i++] = PR_HI_REGS;
17482 return i;
17483 }
17484
17485 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17486
17487 static bool
17488 aarch64_can_change_mode_class (machine_mode from,
17489 machine_mode to, reg_class_t)
17490 {
17491 if (BYTES_BIG_ENDIAN)
17492 {
17493 bool from_sve_p = aarch64_sve_data_mode_p (from);
17494 bool to_sve_p = aarch64_sve_data_mode_p (to);
17495
17496 /* Don't allow changes between SVE data modes and non-SVE modes.
17497 See the comment at the head of aarch64-sve.md for details. */
17498 if (from_sve_p != to_sve_p)
17499 return false;
17500
17501 /* Don't allow changes in element size: lane 0 of the new vector
17502 would not then be lane 0 of the old vector. See the comment
17503 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17504 description.
17505
17506 In the worst case, this forces a register to be spilled in
17507 one mode and reloaded in the other, which handles the
17508 endianness correctly. */
17509 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17510 return false;
17511 }
17512 return true;
17513 }
17514
17515 /* Implement TARGET_EARLY_REMAT_MODES. */
17516
17517 static void
17518 aarch64_select_early_remat_modes (sbitmap modes)
17519 {
17520 /* SVE values are not normally live across a call, so it should be
17521 worth doing early rematerialization even in VL-specific mode. */
17522 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17523 {
17524 machine_mode mode = (machine_mode) i;
17525 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17526 if (vec_flags & VEC_ANY_SVE)
17527 bitmap_set_bit (modes, i);
17528 }
17529 }
17530
17531 /* Target-specific selftests. */
17532
17533 #if CHECKING_P
17534
17535 namespace selftest {
17536
17537 /* Selftest for the RTL loader.
17538 Verify that the RTL loader copes with a dump from
17539 print_rtx_function. This is essentially just a test that class
17540 function_reader can handle a real dump, but it also verifies
17541 that lookup_reg_by_dump_name correctly handles hard regs.
17542 The presence of hard reg names in the dump means that the test is
17543 target-specific, hence it is in this file. */
17544
17545 static void
17546 aarch64_test_loading_full_dump ()
17547 {
17548 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17549
17550 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17551
17552 rtx_insn *insn_1 = get_insn_by_uid (1);
17553 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17554
17555 rtx_insn *insn_15 = get_insn_by_uid (15);
17556 ASSERT_EQ (INSN, GET_CODE (insn_15));
17557 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17558
17559 /* Verify crtl->return_rtx. */
17560 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17561 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17562 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17563 }
17564
17565 /* Run all target-specific selftests. */
17566
17567 static void
17568 aarch64_run_selftests (void)
17569 {
17570 aarch64_test_loading_full_dump ();
17571 }
17572
17573 } // namespace selftest
17574
17575 #endif /* #if CHECKING_P */
17576
17577 #undef TARGET_ADDRESS_COST
17578 #define TARGET_ADDRESS_COST aarch64_address_cost
17579
17580 /* This hook determines whether unnamed bitfields affect the alignment
17581 of the containing structure.  The hook returns true if the structure
17582 should inherit the alignment requirements of an unnamed bitfield's
17583 type.  */
17584 #undef TARGET_ALIGN_ANON_BITFIELD
17585 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17586
17587 #undef TARGET_ASM_ALIGNED_DI_OP
17588 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17589
17590 #undef TARGET_ASM_ALIGNED_HI_OP
17591 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17592
17593 #undef TARGET_ASM_ALIGNED_SI_OP
17594 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17595
17596 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17597 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17598 hook_bool_const_tree_hwi_hwi_const_tree_true
17599
17600 #undef TARGET_ASM_FILE_START
17601 #define TARGET_ASM_FILE_START aarch64_start_file
17602
17603 #undef TARGET_ASM_OUTPUT_MI_THUNK
17604 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17605
17606 #undef TARGET_ASM_SELECT_RTX_SECTION
17607 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17608
17609 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17610 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17611
17612 #undef TARGET_BUILD_BUILTIN_VA_LIST
17613 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17614
17615 #undef TARGET_CALLEE_COPIES
17616 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17617
17618 #undef TARGET_CAN_ELIMINATE
17619 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17620
17621 #undef TARGET_CAN_INLINE_P
17622 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17623
17624 #undef TARGET_CANNOT_FORCE_CONST_MEM
17625 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17626
17627 #undef TARGET_CASE_VALUES_THRESHOLD
17628 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17629
17630 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17631 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17632
17633 /* Only the least significant bit is used for initialization guard
17634 variables. */
17635 #undef TARGET_CXX_GUARD_MASK_BIT
17636 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17637
17638 #undef TARGET_C_MODE_FOR_SUFFIX
17639 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17640
17641 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17642 #undef TARGET_DEFAULT_TARGET_FLAGS
17643 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17644 #endif
17645
17646 #undef TARGET_CLASS_MAX_NREGS
17647 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17648
17649 #undef TARGET_BUILTIN_DECL
17650 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17651
17652 #undef TARGET_BUILTIN_RECIPROCAL
17653 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17654
17655 #undef TARGET_C_EXCESS_PRECISION
17656 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17657
17658 #undef TARGET_EXPAND_BUILTIN
17659 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17660
17661 #undef TARGET_EXPAND_BUILTIN_VA_START
17662 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17663
17664 #undef TARGET_FOLD_BUILTIN
17665 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17666
17667 #undef TARGET_FUNCTION_ARG
17668 #define TARGET_FUNCTION_ARG aarch64_function_arg
17669
17670 #undef TARGET_FUNCTION_ARG_ADVANCE
17671 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17672
17673 #undef TARGET_FUNCTION_ARG_BOUNDARY
17674 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17675
17676 #undef TARGET_FUNCTION_ARG_PADDING
17677 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17678
17679 #undef TARGET_GET_RAW_RESULT_MODE
17680 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17681 #undef TARGET_GET_RAW_ARG_MODE
17682 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17683
17684 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17685 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17686
17687 #undef TARGET_FUNCTION_VALUE
17688 #define TARGET_FUNCTION_VALUE aarch64_function_value
17689
17690 #undef TARGET_FUNCTION_VALUE_REGNO_P
17691 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17692
17693 #undef TARGET_GIMPLE_FOLD_BUILTIN
17694 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17695
17696 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17697 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17698
17699 #undef TARGET_INIT_BUILTINS
17700 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17701
17702 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17703 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17704 aarch64_ira_change_pseudo_allocno_class
17705
17706 #undef TARGET_LEGITIMATE_ADDRESS_P
17707 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17708
17709 #undef TARGET_LEGITIMATE_CONSTANT_P
17710 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17711
17712 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17713 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17714 aarch64_legitimize_address_displacement
17715
17716 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17717 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17718
17719 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17720 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17721 aarch64_libgcc_floating_mode_supported_p
17722
17723 #undef TARGET_MANGLE_TYPE
17724 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17725
17726 #undef TARGET_MEMORY_MOVE_COST
17727 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17728
17729 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17730 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17731
17732 #undef TARGET_MUST_PASS_IN_STACK
17733 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17734
17735 /* This target hook should return true if accesses to volatile bitfields
17736 should use the narrowest mode possible. It should return false if these
17737 accesses should use the bitfield container type. */
17738 #undef TARGET_NARROW_VOLATILE_BITFIELD
17739 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17740
17741 #undef TARGET_OPTION_OVERRIDE
17742 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17743
17744 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17745 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17746 aarch64_override_options_after_change
17747
17748 #undef TARGET_OPTION_SAVE
17749 #define TARGET_OPTION_SAVE aarch64_option_save
17750
17751 #undef TARGET_OPTION_RESTORE
17752 #define TARGET_OPTION_RESTORE aarch64_option_restore
17753
17754 #undef TARGET_OPTION_PRINT
17755 #define TARGET_OPTION_PRINT aarch64_option_print
17756
17757 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17758 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17759
17760 #undef TARGET_SET_CURRENT_FUNCTION
17761 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17762
17763 #undef TARGET_PASS_BY_REFERENCE
17764 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17765
17766 #undef TARGET_PREFERRED_RELOAD_CLASS
17767 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17768
17769 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17770 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17771
17772 #undef TARGET_PROMOTED_TYPE
17773 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17774
17775 #undef TARGET_SECONDARY_RELOAD
17776 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17777
17778 #undef TARGET_SHIFT_TRUNCATION_MASK
17779 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17780
17781 #undef TARGET_SETUP_INCOMING_VARARGS
17782 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17783
17784 #undef TARGET_STRUCT_VALUE_RTX
17785 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17786
17787 #undef TARGET_REGISTER_MOVE_COST
17788 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17789
17790 #undef TARGET_RETURN_IN_MEMORY
17791 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17792
17793 #undef TARGET_RETURN_IN_MSB
17794 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17795
17796 #undef TARGET_RTX_COSTS
17797 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17798
17799 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17800 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17801
17802 #undef TARGET_SCHED_ISSUE_RATE
17803 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17804
17805 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17806 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17807 aarch64_sched_first_cycle_multipass_dfa_lookahead
17808
17809 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17810 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17811 aarch64_first_cycle_multipass_dfa_lookahead_guard
17812
17813 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17814 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17815 aarch64_get_separate_components
17816
17817 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17818 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17819 aarch64_components_for_bb
17820
17821 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17822 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17823 aarch64_disqualify_components
17824
17825 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17826 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17827 aarch64_emit_prologue_components
17828
17829 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17830 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17831 aarch64_emit_epilogue_components
17832
17833 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17834 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17835 aarch64_set_handled_components
17836
17837 #undef TARGET_TRAMPOLINE_INIT
17838 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17839
17840 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17841 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17842
17843 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17844 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17845
17846 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17847 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17848 aarch64_builtin_support_vector_misalignment
17849
17850 #undef TARGET_ARRAY_MODE
17851 #define TARGET_ARRAY_MODE aarch64_array_mode
17852
17853 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17854 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17855
17856 #undef TARGET_VECTORIZE_ADD_STMT_COST
17857 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17858
17859 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17860 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17861 aarch64_builtin_vectorization_cost
17862
17863 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17864 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17865
17866 #undef TARGET_VECTORIZE_BUILTINS
17867 #define TARGET_VECTORIZE_BUILTINS
17868
17869 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17870 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17871 aarch64_builtin_vectorized_function
17872
17873 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17874 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17875 aarch64_autovectorize_vector_sizes
17876
17877 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17878 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17879 aarch64_atomic_assign_expand_fenv
17880
17881 /* Section anchor support. */
17882
17883 #undef TARGET_MIN_ANCHOR_OFFSET
17884 #define TARGET_MIN_ANCHOR_OFFSET -256
17885
17886 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17887 byte offset; we can do much more for larger data types, but have no way
17888 to determine the size of the access. We assume accesses are aligned. */
17889 #undef TARGET_MAX_ANCHOR_OFFSET
17890 #define TARGET_MAX_ANCHOR_OFFSET 4095
17891
17892 #undef TARGET_VECTOR_ALIGNMENT
17893 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17894
17895 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17896 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17897 aarch64_vectorize_preferred_vector_alignment
17898 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17899 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17900 aarch64_simd_vector_alignment_reachable
17901
17902 /* vec_perm support. */
17903
17904 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17905 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17906 aarch64_vectorize_vec_perm_const
17907
17908 #undef TARGET_VECTORIZE_GET_MASK_MODE
17909 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17910 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17911 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17912 aarch64_empty_mask_is_expensive
17913
17914 #undef TARGET_INIT_LIBFUNCS
17915 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17916
17917 #undef TARGET_FIXED_CONDITION_CODE_REGS
17918 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17919
17920 #undef TARGET_FLAGS_REGNUM
17921 #define TARGET_FLAGS_REGNUM CC_REGNUM
17922
17923 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17924 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17925
17926 #undef TARGET_ASAN_SHADOW_OFFSET
17927 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17928
17929 #undef TARGET_LEGITIMIZE_ADDRESS
17930 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17931
17932 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17933 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17934
17935 #undef TARGET_CAN_USE_DOLOOP_P
17936 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17937
17938 #undef TARGET_SCHED_ADJUST_PRIORITY
17939 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17940
17941 #undef TARGET_SCHED_MACRO_FUSION_P
17942 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17943
17944 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17945 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17946
17947 #undef TARGET_SCHED_FUSION_PRIORITY
17948 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17949
17950 #undef TARGET_UNSPEC_MAY_TRAP_P
17951 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17952
17953 #undef TARGET_USE_PSEUDO_PIC_REG
17954 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17955
17956 #undef TARGET_PRINT_OPERAND
17957 #define TARGET_PRINT_OPERAND aarch64_print_operand
17958
17959 #undef TARGET_PRINT_OPERAND_ADDRESS
17960 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17961
17962 #undef TARGET_OPTAB_SUPPORTED_P
17963 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17964
17965 #undef TARGET_OMIT_STRUCT_RETURN_REG
17966 #define TARGET_OMIT_STRUCT_RETURN_REG true
17967
17968 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17969 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17970 aarch64_dwarf_poly_indeterminate_value
17971
17972 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
17973 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17974 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
17975
17976 #undef TARGET_HARD_REGNO_NREGS
17977 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17978 #undef TARGET_HARD_REGNO_MODE_OK
17979 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17980
17981 #undef TARGET_MODES_TIEABLE_P
17982 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17983
17984 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17985 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17986 aarch64_hard_regno_call_part_clobbered
17987
17988 #undef TARGET_CONSTANT_ALIGNMENT
17989 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17990
17991 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17992 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17993
17994 #undef TARGET_CAN_CHANGE_MODE_CLASS
17995 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17996
17997 #undef TARGET_SELECT_EARLY_REMAT_MODES
17998 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17999
18000 #if CHECKING_P
18001 #undef TARGET_RUN_TARGET_SELFTESTS
18002 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18003 #endif /* #if CHECKING_P */
18004
18005 struct gcc_target targetm = TARGET_INITIALIZER;
18006
18007 #include "gt-aarch64.h"