gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2018 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #define IN_TARGET_CODE 1
  22
  23 #include "config.h"
  24 #define INCLUDE_STRING
  25 #include "system.h"
  26 #include "coretypes.h"
  27 #include "backend.h"
  28 #include "target.h"
  29 #include "rtl.h"
  30 #include "tree.h"
  31 #include "memmodel.h"
  32 #include "gimple.h"
  33 #include "cfghooks.h"
  34 #include "cfgloop.h"
  35 #include "df.h"
  36 #include "tm_p.h"
  37 #include "stringpool.h"
  38 #include "attribs.h"
  39 #include "optabs.h"
  40 #include "regs.h"
  41 #include "emit-rtl.h"
  42 #include "recog.h"
  43 #include "diagnostic.h"
  44 #include "insn-attr.h"
  45 #include "alias.h"
  46 #include "fold-const.h"
  47 #include "stor-layout.h"
  48 #include "calls.h"
  49 #include "varasm.h"
  50 #include "output.h"
  51 #include "flags.h"
  52 #include "explow.h"
  53 #include "expr.h"
  54 #include "reload.h"
  55 #include "langhooks.h"
  56 #include "opts.h"
  57 #include "params.h"
  58 #include "gimplify.h"
  59 #include "dwarf2.h"
  60 #include "gimple-iterator.h"
  61 #include "tree-vectorizer.h"
  62 #include "aarch64-cost-tables.h"
  63 #include "dumpfile.h"
  64 #include "builtins.h"
  65 #include "rtl-iter.h"
  66 #include "tm-constrs.h"
  67 #include "sched-int.h"
  68 #include "target-globals.h"
  69 #include "common/common-target.h"
  70 #include "cfgrtl.h"
  71 #include "selftest.h"
  72 #include "selftest-rtl.h"
  73 #include "rtx-vector-builder.h"
  74
  75 /* This file should be included last.  */
  76 #include "target-def.h"
  77
  78 /* Defined for convenience.  */
  79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  80
  81 /* Classifies an address.
  82
  83    ADDRESS_REG_IMM
  84        A simple base register plus immediate offset.
  85
  86    ADDRESS_REG_WB
  87        A base register indexed by immediate offset with writeback.
  88
  89    ADDRESS_REG_REG
  90        A base register indexed by (optionally scaled) register.
  91
  92    ADDRESS_REG_UXTW
  93        A base register indexed by (optionally scaled) zero-extended register.
  94
  95    ADDRESS_REG_SXTW
  96        A base register indexed by (optionally scaled) sign-extended register.
  97
  98    ADDRESS_LO_SUM
  99        A LO_SUM rtx with a base register and "LO12" symbol relocation.
 100
 101    ADDRESS_SYMBOLIC:
 102        A constant symbolic address, in pc-relative literal pool.  */
 103
 104 enum aarch64_address_type {
 105   ADDRESS_REG_IMM,
 106   ADDRESS_REG_WB,
 107   ADDRESS_REG_REG,
 108   ADDRESS_REG_UXTW,
 109   ADDRESS_REG_SXTW,
 110   ADDRESS_LO_SUM,
 111   ADDRESS_SYMBOLIC
 112 };
 113
 114 struct aarch64_address_info {
 115   enum aarch64_address_type type;
 116   rtx base;
 117   rtx offset;
 118   poly_int64 const_offset;
 119   int shift;
 120   enum aarch64_symbol_type symbol_type;
 121 };
 122
 123 /* Information about a legitimate vector immediate operand.  */
 124 struct simd_immediate_info
 125 {
 126   enum insn_type { MOV, MVN };
 127   enum modifier_type { LSL, MSL };
 128
 129   simd_immediate_info () {}
 130   simd_immediate_info (scalar_float_mode, rtx);
 131   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
 132                        insn_type = MOV, modifier_type = LSL,
 133                        unsigned int = 0);
 134   simd_immediate_info (scalar_mode, rtx, rtx);
 135
 136   /* The mode of the elements.  */
 137   scalar_mode elt_mode;
 138
 139   /* The value of each element if all elements are the same, or the
 140      first value if the constant is a series.  */
 141   rtx value;
 142
 143   /* The value of the step if the constant is a series, null otherwise.  */
 144   rtx step;
 145
 146   /* The instruction to use to move the immediate into a vector.  */
 147   insn_type insn;
 148
 149   /* The kind of shift modifier to use, and the number of bits to shift.
 150      This is (LSL, 0) if no shift is needed.  */
 151   modifier_type modifier;
 152   unsigned int shift;
 153 };
 154
 155 /* Construct a floating-point immediate in which each element has mode
 156    ELT_MODE_IN and value VALUE_IN.  */
 157 inline simd_immediate_info
 158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
 159   : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
 160     modifier (LSL), shift (0)
 161 {}
 162
 163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 164    and value VALUE_IN.  The other parameters are as for the structure
 165    fields.  */
 166 inline simd_immediate_info
 167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
 168                        unsigned HOST_WIDE_INT value_in,
 169                        insn_type insn_in, modifier_type modifier_in,
 170                        unsigned int shift_in)
 171   : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
 172     step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
 173 {}
 174
 175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
 176    and where element I is equal to VALUE_IN + I * STEP_IN.  */
 177 inline simd_immediate_info
 178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
 179   : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
 180     modifier (LSL), shift (0)
 181 {}
 182
 183 /* The current code model.  */
 184 enum aarch64_code_model aarch64_cmodel;
 185
 186 /* The number of 64-bit elements in an SVE vector.  */
 187 poly_uint16 aarch64_sve_vg;
 188
 189 #ifdef HAVE_AS_TLS
 190 #undef TARGET_HAVE_TLS
 191 #define TARGET_HAVE_TLS 1
 192 #endif
 193
 194 static bool aarch64_composite_type_p (const_tree, machine_mode);
 195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 196                                                      const_tree,
 197                                                      machine_mode *, int *,
 198                                                      bool *);
 199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 201 static void aarch64_override_options_after_change (void);
 202 static bool aarch64_vector_mode_supported_p (machine_mode);
 203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 205                                                          const_tree type,
 206                                                          int misalignment,
 207                                                          bool is_packed);
 208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
 210
 211 /* Major revision number of the ARM Architecture implemented by the target.  */
 212 unsigned aarch64_architecture_version;
 213
 214 /* The processor for which instructions should be scheduled.  */
 215 enum aarch64_processor aarch64_tune = cortexa53;
 216
 217 /* Mask to specify which instruction scheduling options should be used.  */
 218 unsigned long aarch64_tune_flags = 0;
 219
 220 /* Global flag for PC relative loads.  */
 221 bool aarch64_pcrelative_literal_loads;
 222
 223 /* Support for command line parsing of boolean flags in the tuning
 224    structures.  */
 225 struct aarch64_flag_desc
 226 {
 227   const char* name;
 228   unsigned int flag;
 229 };
 230
 231 #define AARCH64_FUSION_PAIR(name, internal_name) \
 232   { name, AARCH64_FUSE_##internal_name },
 233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 234 {
 235   { "none", AARCH64_FUSE_NOTHING },
 236 #include "aarch64-fusion-pairs.def"
 237   { "all", AARCH64_FUSE_ALL },
 238   { NULL, AARCH64_FUSE_NOTHING }
 239 };
 240
 241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 242   { name, AARCH64_EXTRA_TUNE_##internal_name },
 243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 244 {
 245   { "none", AARCH64_EXTRA_TUNE_NONE },
 246 #include "aarch64-tuning-flags.def"
 247   { "all", AARCH64_EXTRA_TUNE_ALL },
 248   { NULL, AARCH64_EXTRA_TUNE_NONE }
 249 };
 250
 251 /* Tuning parameters.  */
 252
 253 static const struct cpu_addrcost_table generic_addrcost_table =
 254 {
 255     {
 256       1, /* hi  */
 257       0, /* si  */
 258       0, /* di  */
 259       1, /* ti  */
 260     },
 261   0, /* pre_modify  */
 262   0, /* post_modify  */
 263   0, /* register_offset  */
 264   0, /* register_sextend  */
 265   0, /* register_zextend  */
 266   0 /* imm_offset  */
 267 };
 268
 269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 270 {
 271     {
 272       0, /* hi  */
 273       0, /* si  */
 274       0, /* di  */
 275       2, /* ti  */
 276     },
 277   0, /* pre_modify  */
 278   0, /* post_modify  */
 279   1, /* register_offset  */
 280   1, /* register_sextend  */
 281   2, /* register_zextend  */
 282   0, /* imm_offset  */
 283 };
 284
 285 static const struct cpu_addrcost_table xgene1_addrcost_table =
 286 {
 287     {
 288       1, /* hi  */
 289       0, /* si  */
 290       0, /* di  */
 291       1, /* ti  */
 292     },
 293   1, /* pre_modify  */
 294   0, /* post_modify  */
 295   0, /* register_offset  */
 296   1, /* register_sextend  */
 297   1, /* register_zextend  */
 298   0, /* imm_offset  */
 299 };
 300
 301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 302 {
 303     {
 304       1, /* hi  */
 305       1, /* si  */
 306       1, /* di  */
 307       2, /* ti  */
 308     },
 309   0, /* pre_modify  */
 310   0, /* post_modify  */
 311   2, /* register_offset  */
 312   3, /* register_sextend  */
 313   3, /* register_zextend  */
 314   0, /* imm_offset  */
 315 };
 316
 317 static const struct cpu_regmove_cost generic_regmove_cost =
 318 {
 319   1, /* GP2GP  */
 320   /* Avoid the use of slow int<->fp moves for spilling by setting
 321      their cost higher than memmov_cost.  */
 322   5, /* GP2FP  */
 323   5, /* FP2GP  */
 324   2 /* FP2FP  */
 325 };
 326
 327 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 328 {
 329   1, /* GP2GP  */
 330   /* Avoid the use of slow int<->fp moves for spilling by setting
 331      their cost higher than memmov_cost.  */
 332   5, /* GP2FP  */
 333   5, /* FP2GP  */
 334   2 /* FP2FP  */
 335 };
 336
 337 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 338 {
 339   1, /* GP2GP  */
 340   /* Avoid the use of slow int<->fp moves for spilling by setting
 341      their cost higher than memmov_cost.  */
 342   5, /* GP2FP  */
 343   5, /* FP2GP  */
 344   2 /* FP2FP  */
 345 };
 346
 347 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 348 {
 349   1, /* GP2GP  */
 350   /* Avoid the use of slow int<->fp moves for spilling by setting
 351      their cost higher than memmov_cost (actual, 4 and 9).  */
 352   9, /* GP2FP  */
 353   9, /* FP2GP  */
 354   1 /* FP2FP  */
 355 };
 356
 357 static const struct cpu_regmove_cost thunderx_regmove_cost =
 358 {
 359   2, /* GP2GP  */
 360   2, /* GP2FP  */
 361   6, /* FP2GP  */
 362   4 /* FP2FP  */
 363 };
 364
 365 static const struct cpu_regmove_cost xgene1_regmove_cost =
 366 {
 367   1, /* GP2GP  */
 368   /* Avoid the use of slow int<->fp moves for spilling by setting
 369      their cost higher than memmov_cost.  */
 370   8, /* GP2FP  */
 371   8, /* FP2GP  */
 372   2 /* FP2FP  */
 373 };
 374
 375 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 376 {
 377   2, /* GP2GP  */
 378   /* Avoid the use of int<->fp moves for spilling.  */
 379   6, /* GP2FP  */
 380   6, /* FP2GP  */
 381   4 /* FP2FP  */
 382 };
 383
 384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 385 {
 386   1, /* GP2GP  */
 387   /* Avoid the use of int<->fp moves for spilling.  */
 388   8, /* GP2FP  */
 389   8, /* FP2GP  */
 390   4  /* FP2FP  */
 391 };
 392
 393 /* Generic costs for vector insn classes.  */
 394 static const struct cpu_vector_cost generic_vector_cost =
 395 {
 396   1, /* scalar_int_stmt_cost  */
 397   1, /* scalar_fp_stmt_cost  */
 398   1, /* scalar_load_cost  */
 399   1, /* scalar_store_cost  */
 400   1, /* vec_int_stmt_cost  */
 401   1, /* vec_fp_stmt_cost  */
 402   2, /* vec_permute_cost  */
 403   1, /* vec_to_scalar_cost  */
 404   1, /* scalar_to_vec_cost  */
 405   1, /* vec_align_load_cost  */
 406   1, /* vec_unalign_load_cost  */
 407   1, /* vec_unalign_store_cost  */
 408   1, /* vec_store_cost  */
 409   3, /* cond_taken_branch_cost  */
 410   1 /* cond_not_taken_branch_cost  */
 411 };
 412
 413 /* ThunderX costs for vector insn classes.  */
 414 static const struct cpu_vector_cost thunderx_vector_cost =
 415 {
 416   1, /* scalar_int_stmt_cost  */
 417   1, /* scalar_fp_stmt_cost  */
 418   3, /* scalar_load_cost  */
 419   1, /* scalar_store_cost  */
 420   4, /* vec_int_stmt_cost  */
 421   1, /* vec_fp_stmt_cost  */
 422   4, /* vec_permute_cost  */
 423   2, /* vec_to_scalar_cost  */
 424   2, /* scalar_to_vec_cost  */
 425   3, /* vec_align_load_cost  */
 426   5, /* vec_unalign_load_cost  */
 427   5, /* vec_unalign_store_cost  */
 428   1, /* vec_store_cost  */
 429   3, /* cond_taken_branch_cost  */
 430   3 /* cond_not_taken_branch_cost  */
 431 };
 432
 433 /* Generic costs for vector insn classes.  */
 434 static const struct cpu_vector_cost cortexa57_vector_cost =
 435 {
 436   1, /* scalar_int_stmt_cost  */
 437   1, /* scalar_fp_stmt_cost  */
 438   4, /* scalar_load_cost  */
 439   1, /* scalar_store_cost  */
 440   2, /* vec_int_stmt_cost  */
 441   2, /* vec_fp_stmt_cost  */
 442   3, /* vec_permute_cost  */
 443   8, /* vec_to_scalar_cost  */
 444   8, /* scalar_to_vec_cost  */
 445   4, /* vec_align_load_cost  */
 446   4, /* vec_unalign_load_cost  */
 447   1, /* vec_unalign_store_cost  */
 448   1, /* vec_store_cost  */
 449   1, /* cond_taken_branch_cost  */
 450   1 /* cond_not_taken_branch_cost  */
 451 };
 452
 453 static const struct cpu_vector_cost exynosm1_vector_cost =
 454 {
 455   1, /* scalar_int_stmt_cost  */
 456   1, /* scalar_fp_stmt_cost  */
 457   5, /* scalar_load_cost  */
 458   1, /* scalar_store_cost  */
 459   3, /* vec_int_stmt_cost  */
 460   3, /* vec_fp_stmt_cost  */
 461   3, /* vec_permute_cost  */
 462   3, /* vec_to_scalar_cost  */
 463   3, /* scalar_to_vec_cost  */
 464   5, /* vec_align_load_cost  */
 465   5, /* vec_unalign_load_cost  */
 466   1, /* vec_unalign_store_cost  */
 467   1, /* vec_store_cost  */
 468   1, /* cond_taken_branch_cost  */
 469   1 /* cond_not_taken_branch_cost  */
 470 };
 471
 472 /* Generic costs for vector insn classes.  */
 473 static const struct cpu_vector_cost xgene1_vector_cost =
 474 {
 475   1, /* scalar_int_stmt_cost  */
 476   1, /* scalar_fp_stmt_cost  */
 477   5, /* scalar_load_cost  */
 478   1, /* scalar_store_cost  */
 479   2, /* vec_int_stmt_cost  */
 480   2, /* vec_fp_stmt_cost  */
 481   2, /* vec_permute_cost  */
 482   4, /* vec_to_scalar_cost  */
 483   4, /* scalar_to_vec_cost  */
 484   10, /* vec_align_load_cost  */
 485   10, /* vec_unalign_load_cost  */
 486   2, /* vec_unalign_store_cost  */
 487   2, /* vec_store_cost  */
 488   2, /* cond_taken_branch_cost  */
 489   1 /* cond_not_taken_branch_cost  */
 490 };
 491
 492 /* Costs for vector insn classes for Vulcan.  */
 493 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 494 {
 495   1, /* scalar_int_stmt_cost  */
 496   6, /* scalar_fp_stmt_cost  */
 497   4, /* scalar_load_cost  */
 498   1, /* scalar_store_cost  */
 499   5, /* vec_int_stmt_cost  */
 500   6, /* vec_fp_stmt_cost  */
 501   3, /* vec_permute_cost  */
 502   6, /* vec_to_scalar_cost  */
 503   5, /* scalar_to_vec_cost  */
 504   8, /* vec_align_load_cost  */
 505   8, /* vec_unalign_load_cost  */
 506   4, /* vec_unalign_store_cost  */
 507   4, /* vec_store_cost  */
 508   2, /* cond_taken_branch_cost  */
 509   1  /* cond_not_taken_branch_cost  */
 510 };
 511
 512 /* Generic costs for branch instructions.  */
 513 static const struct cpu_branch_cost generic_branch_cost =
 514 {
 515   1,  /* Predictable.  */
 516   3   /* Unpredictable.  */
 517 };
 518
 519 /* Generic approximation modes.  */
 520 static const cpu_approx_modes generic_approx_modes =
 521 {
 522   AARCH64_APPROX_NONE,  /* division  */
 523   AARCH64_APPROX_NONE,  /* sqrt  */
 524   AARCH64_APPROX_NONE   /* recip_sqrt  */
 525 };
 526
 527 /* Approximation modes for Exynos M1.  */
 528 static const cpu_approx_modes exynosm1_approx_modes =
 529 {
 530   AARCH64_APPROX_NONE,  /* division  */
 531   AARCH64_APPROX_ALL,   /* sqrt  */
 532   AARCH64_APPROX_ALL    /* recip_sqrt  */
 533 };
 534
 535 /* Approximation modes for X-Gene 1.  */
 536 static const cpu_approx_modes xgene1_approx_modes =
 537 {
 538   AARCH64_APPROX_NONE,  /* division  */
 539   AARCH64_APPROX_NONE,  /* sqrt  */
 540   AARCH64_APPROX_ALL    /* recip_sqrt  */
 541 };
 542
 543 /* Generic prefetch settings (which disable prefetch).  */
 544 static const cpu_prefetch_tune generic_prefetch_tune =
 545 {
 546   0,                    /* num_slots  */
 547   -1,                   /* l1_cache_size  */
 548   -1,                   /* l1_cache_line_size  */
 549   -1,                   /* l2_cache_size  */
 550   -1                    /* default_opt_level  */
 551 };
 552
 553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 554 {
 555   0,                    /* num_slots  */
 556   -1,                   /* l1_cache_size  */
 557   64,                   /* l1_cache_line_size  */
 558   -1,                   /* l2_cache_size  */
 559   -1                    /* default_opt_level  */
 560 };
 561
 562 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 563 {
 564   4,                    /* num_slots  */
 565   32,                   /* l1_cache_size  */
 566   64,                   /* l1_cache_line_size  */
 567   512,                  /* l2_cache_size  */
 568   -1                    /* default_opt_level  */
 569 };
 570
 571 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 572 {
 573   8,                    /* num_slots  */
 574   32,                   /* l1_cache_size  */
 575   128,                  /* l1_cache_line_size  */
 576   16*1024,              /* l2_cache_size  */
 577   3                     /* default_opt_level  */
 578 };
 579
 580 static const cpu_prefetch_tune thunderx_prefetch_tune =
 581 {
 582   8,                    /* num_slots  */
 583   32,                   /* l1_cache_size  */
 584   128,                  /* l1_cache_line_size  */
 585   -1,                   /* l2_cache_size  */
 586   -1                    /* default_opt_level  */
 587 };
 588
 589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 590 {
 591   8,                    /* num_slots  */
 592   32,                   /* l1_cache_size  */
 593   64,                   /* l1_cache_line_size  */
 594   256,                  /* l2_cache_size  */
 595   -1                    /* default_opt_level  */
 596 };
 597
 598 static const struct tune_params generic_tunings =
 599 {
 600   &cortexa57_extra_costs,
 601   &generic_addrcost_table,
 602   &generic_regmove_cost,
 603   &generic_vector_cost,
 604   &generic_branch_cost,
 605   &generic_approx_modes,
 606   4, /* memmov_cost  */
 607   2, /* issue_rate  */
 608   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 609   8,    /* function_align.  */
 610   4,    /* jump_align.  */
 611   8,    /* loop_align.  */
 612   2,    /* int_reassoc_width.  */
 613   4,    /* fp_reassoc_width.  */
 614   1,    /* vec_reassoc_width.  */
 615   2,    /* min_div_recip_mul_sf.  */
 616   2,    /* min_div_recip_mul_df.  */
 617   0,    /* max_case_values.  */
 618   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 619   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 620   &generic_prefetch_tune
 621 };
 622
 623 static const struct tune_params cortexa35_tunings =
 624 {
 625   &cortexa53_extra_costs,
 626   &generic_addrcost_table,
 627   &cortexa53_regmove_cost,
 628   &generic_vector_cost,
 629   &generic_branch_cost,
 630   &generic_approx_modes,
 631   4, /* memmov_cost  */
 632   1, /* issue_rate  */
 633   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 634    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 635   16,   /* function_align.  */
 636   4,    /* jump_align.  */
 637   8,    /* loop_align.  */
 638   2,    /* int_reassoc_width.  */
 639   4,    /* fp_reassoc_width.  */
 640   1,    /* vec_reassoc_width.  */
 641   2,    /* min_div_recip_mul_sf.  */
 642   2,    /* min_div_recip_mul_df.  */
 643   0,    /* max_case_values.  */
 644   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 645   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 646   &generic_prefetch_tune
 647 };
 648
 649 static const struct tune_params cortexa53_tunings =
 650 {
 651   &cortexa53_extra_costs,
 652   &generic_addrcost_table,
 653   &cortexa53_regmove_cost,
 654   &generic_vector_cost,
 655   &generic_branch_cost,
 656   &generic_approx_modes,
 657   4, /* memmov_cost  */
 658   2, /* issue_rate  */
 659   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 660    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 661   16,   /* function_align.  */
 662   4,    /* jump_align.  */
 663   8,    /* loop_align.  */
 664   2,    /* int_reassoc_width.  */
 665   4,    /* fp_reassoc_width.  */
 666   1,    /* vec_reassoc_width.  */
 667   2,    /* min_div_recip_mul_sf.  */
 668   2,    /* min_div_recip_mul_df.  */
 669   0,    /* max_case_values.  */
 670   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 671   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 672   &generic_prefetch_tune
 673 };
 674
 675 static const struct tune_params cortexa57_tunings =
 676 {
 677   &cortexa57_extra_costs,
 678   &generic_addrcost_table,
 679   &cortexa57_regmove_cost,
 680   &cortexa57_vector_cost,
 681   &generic_branch_cost,
 682   &generic_approx_modes,
 683   4, /* memmov_cost  */
 684   3, /* issue_rate  */
 685   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 686    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 687   16,   /* function_align.  */
 688   4,    /* jump_align.  */
 689   8,    /* loop_align.  */
 690   2,    /* int_reassoc_width.  */
 691   4,    /* fp_reassoc_width.  */
 692   1,    /* vec_reassoc_width.  */
 693   2,    /* min_div_recip_mul_sf.  */
 694   2,    /* min_div_recip_mul_df.  */
 695   0,    /* max_case_values.  */
 696   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 697   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 698   &generic_prefetch_tune
 699 };
 700
 701 static const struct tune_params cortexa72_tunings =
 702 {
 703   &cortexa57_extra_costs,
 704   &generic_addrcost_table,
 705   &cortexa57_regmove_cost,
 706   &cortexa57_vector_cost,
 707   &generic_branch_cost,
 708   &generic_approx_modes,
 709   4, /* memmov_cost  */
 710   3, /* issue_rate  */
 711   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 712    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 713   16,   /* function_align.  */
 714   4,    /* jump_align.  */
 715   8,    /* loop_align.  */
 716   2,    /* int_reassoc_width.  */
 717   4,    /* fp_reassoc_width.  */
 718   1,    /* vec_reassoc_width.  */
 719   2,    /* min_div_recip_mul_sf.  */
 720   2,    /* min_div_recip_mul_df.  */
 721   0,    /* max_case_values.  */
 722   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 723   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 724   &generic_prefetch_tune
 725 };
 726
 727 static const struct tune_params cortexa73_tunings =
 728 {
 729   &cortexa57_extra_costs,
 730   &generic_addrcost_table,
 731   &cortexa57_regmove_cost,
 732   &cortexa57_vector_cost,
 733   &generic_branch_cost,
 734   &generic_approx_modes,
 735   4, /* memmov_cost.  */
 736   2, /* issue_rate.  */
 737   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 738    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 739   16,   /* function_align.  */
 740   4,    /* jump_align.  */
 741   8,    /* loop_align.  */
 742   2,    /* int_reassoc_width.  */
 743   4,    /* fp_reassoc_width.  */
 744   1,    /* vec_reassoc_width.  */
 745   2,    /* min_div_recip_mul_sf.  */
 746   2,    /* min_div_recip_mul_df.  */
 747   0,    /* max_case_values.  */
 748   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 749   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 750   &generic_prefetch_tune
 751 };
 752
 753
 754
 755 static const struct tune_params exynosm1_tunings =
 756 {
 757   &exynosm1_extra_costs,
 758   &exynosm1_addrcost_table,
 759   &exynosm1_regmove_cost,
 760   &exynosm1_vector_cost,
 761   &generic_branch_cost,
 762   &exynosm1_approx_modes,
 763   4,    /* memmov_cost  */
 764   3,    /* issue_rate  */
 765   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 766   4,    /* function_align.  */
 767   4,    /* jump_align.  */
 768   4,    /* loop_align.  */
 769   2,    /* int_reassoc_width.  */
 770   4,    /* fp_reassoc_width.  */
 771   1,    /* vec_reassoc_width.  */
 772   2,    /* min_div_recip_mul_sf.  */
 773   2,    /* min_div_recip_mul_df.  */
 774   48,   /* max_case_values.  */
 775   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 776   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 777   &exynosm1_prefetch_tune
 778 };
 779
 780 static const struct tune_params thunderxt88_tunings =
 781 {
 782   &thunderx_extra_costs,
 783   &generic_addrcost_table,
 784   &thunderx_regmove_cost,
 785   &thunderx_vector_cost,
 786   &generic_branch_cost,
 787   &generic_approx_modes,
 788   6, /* memmov_cost  */
 789   2, /* issue_rate  */
 790   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 791   8,    /* function_align.  */
 792   8,    /* jump_align.  */
 793   8,    /* loop_align.  */
 794   2,    /* int_reassoc_width.  */
 795   4,    /* fp_reassoc_width.  */
 796   1,    /* vec_reassoc_width.  */
 797   2,    /* min_div_recip_mul_sf.  */
 798   2,    /* min_div_recip_mul_df.  */
 799   0,    /* max_case_values.  */
 800   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 801   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 802   &thunderxt88_prefetch_tune
 803 };
 804
 805 static const struct tune_params thunderx_tunings =
 806 {
 807   &thunderx_extra_costs,
 808   &generic_addrcost_table,
 809   &thunderx_regmove_cost,
 810   &thunderx_vector_cost,
 811   &generic_branch_cost,
 812   &generic_approx_modes,
 813   6, /* memmov_cost  */
 814   2, /* issue_rate  */
 815   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 816   8,    /* function_align.  */
 817   8,    /* jump_align.  */
 818   8,    /* loop_align.  */
 819   2,    /* int_reassoc_width.  */
 820   4,    /* fp_reassoc_width.  */
 821   1,    /* vec_reassoc_width.  */
 822   2,    /* min_div_recip_mul_sf.  */
 823   2,    /* min_div_recip_mul_df.  */
 824   0,    /* max_case_values.  */
 825   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 826   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 827    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 828   &thunderx_prefetch_tune
 829 };
 830
 831 static const struct tune_params xgene1_tunings =
 832 {
 833   &xgene1_extra_costs,
 834   &xgene1_addrcost_table,
 835   &xgene1_regmove_cost,
 836   &xgene1_vector_cost,
 837   &generic_branch_cost,
 838   &xgene1_approx_modes,
 839   6, /* memmov_cost  */
 840   4, /* issue_rate  */
 841   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 842   16,   /* function_align.  */
 843   8,    /* jump_align.  */
 844   16,   /* loop_align.  */
 845   2,    /* int_reassoc_width.  */
 846   4,    /* fp_reassoc_width.  */
 847   1,    /* vec_reassoc_width.  */
 848   2,    /* min_div_recip_mul_sf.  */
 849   2,    /* min_div_recip_mul_df.  */
 850   0,    /* max_case_values.  */
 851   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 852   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 853   &generic_prefetch_tune
 854 };
 855
 856 static const struct tune_params qdf24xx_tunings =
 857 {
 858   &qdf24xx_extra_costs,
 859   &generic_addrcost_table,
 860   &qdf24xx_regmove_cost,
 861   &generic_vector_cost,
 862   &generic_branch_cost,
 863   &generic_approx_modes,
 864   4, /* memmov_cost  */
 865   4, /* issue_rate  */
 866   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 867    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 868   16,   /* function_align.  */
 869   8,    /* jump_align.  */
 870   16,   /* loop_align.  */
 871   2,    /* int_reassoc_width.  */
 872   4,    /* fp_reassoc_width.  */
 873   1,    /* vec_reassoc_width.  */
 874   2,    /* min_div_recip_mul_sf.  */
 875   2,    /* min_div_recip_mul_df.  */
 876   0,    /* max_case_values.  */
 877   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 878   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
 879   &qdf24xx_prefetch_tune
 880 };
 881
 882 /* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
 883    for now.  */
 884 static const struct tune_params saphira_tunings =
 885 {
 886   &generic_extra_costs,
 887   &generic_addrcost_table,
 888   &generic_regmove_cost,
 889   &generic_vector_cost,
 890   &generic_branch_cost,
 891   &generic_approx_modes,
 892   4, /* memmov_cost  */
 893   4, /* issue_rate  */
 894   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 895    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 896   16,   /* function_align.  */
 897   8,    /* jump_align.  */
 898   16,   /* loop_align.  */
 899   2,    /* int_reassoc_width.  */
 900   4,    /* fp_reassoc_width.  */
 901   1,    /* vec_reassoc_width.  */
 902   2,    /* min_div_recip_mul_sf.  */
 903   2,    /* min_div_recip_mul_df.  */
 904   0,    /* max_case_values.  */
 905   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 906   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
 907   &generic_prefetch_tune
 908 };
 909
 910 static const struct tune_params thunderx2t99_tunings =
 911 {
 912   &thunderx2t99_extra_costs,
 913   &thunderx2t99_addrcost_table,
 914   &thunderx2t99_regmove_cost,
 915   &thunderx2t99_vector_cost,
 916   &generic_branch_cost,
 917   &generic_approx_modes,
 918   4, /* memmov_cost.  */
 919   4, /* issue_rate.  */
 920   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
 921    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 922   16,   /* function_align.  */
 923   8,    /* jump_align.  */
 924   16,   /* loop_align.  */
 925   3,    /* int_reassoc_width.  */
 926   2,    /* fp_reassoc_width.  */
 927   2,    /* vec_reassoc_width.  */
 928   2,    /* min_div_recip_mul_sf.  */
 929   2,    /* min_div_recip_mul_df.  */
 930   0,    /* max_case_values.  */
 931   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 932   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 933   &thunderx2t99_prefetch_tune
 934 };
 935
 936 /* Support for fine-grained override of the tuning structures.  */
 937 struct aarch64_tuning_override_function
 938 {
 939   const char* name;
 940   void (*parse_override)(const char*, struct tune_params*);
 941 };
 942
 943 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 944 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 945
 946 static const struct aarch64_tuning_override_function
 947 aarch64_tuning_override_functions[] =
 948 {
 949   { "fuse", aarch64_parse_fuse_string },
 950   { "tune", aarch64_parse_tune_string },
 951   { NULL, NULL }
 952 };
 953
 954 /* A processor implementing AArch64.  */
 955 struct processor
 956 {
 957   const char *const name;
 958   enum aarch64_processor ident;
 959   enum aarch64_processor sched_core;
 960   enum aarch64_arch arch;
 961   unsigned architecture_version;
 962   const unsigned long flags;
 963   const struct tune_params *const tune;
 964 };
 965
 966 /* Architectures implementing AArch64.  */
 967 static const struct processor all_architectures[] =
 968 {
 969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 970   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 971 #include "aarch64-arches.def"
 972   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 973 };
 974
 975 /* Processor cores implementing AArch64.  */
 976 static const struct processor all_cores[] =
 977 {
 978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
 979   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 980   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 981   FLAGS, &COSTS##_tunings},
 982 #include "aarch64-cores.def"
 983   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 984     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 985   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 986 };
 987
 988
 989 /* Target specification.  These are populated by the -march, -mtune, -mcpu
 990    handling code or by target attributes.  */
 991 static const struct processor *selected_arch;
 992 static const struct processor *selected_cpu;
 993 static const struct processor *selected_tune;
 994
 995 /* The current tuning set.  */
 996 struct tune_params aarch64_tune_params = generic_tunings;
 997
 998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 999
1000 /* An ISA extension in the co-processor and main instruction set space.  */
1001 struct aarch64_option_extension
1002 {
1003   const char *const name;
1004   const unsigned long flags_on;
1005   const unsigned long flags_off;
1006 };
1007
1008 typedef enum aarch64_cond_code
1009 {
1010   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1011   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1012   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1013 }
1014 aarch64_cc;
1015
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1017
1018 /* The condition codes of the processor, and the inverse function.  */
1019 static const char * const aarch64_condition_codes[] =
1020 {
1021   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1023 };
1024
1025 /* Generate code to enable conditional branches in functions over 1 MiB.  */
1026 const char *
1027 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1028                         const char * branch_format)
1029 {
1030     rtx_code_label * tmp_label = gen_label_rtx ();
1031     char label_buf[256];
1032     char buffer[128];
1033     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1034                                  CODE_LABEL_NUMBER (tmp_label));
1035     const char *label_ptr = targetm.strip_name_encoding (label_buf);
1036     rtx dest_label = operands[pos_label];
1037     operands[pos_label] = tmp_label;
1038
1039     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1040     output_asm_insn (buffer, operands);
1041
1042     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1043     operands[pos_label] = dest_label;
1044     output_asm_insn (buffer, operands);
1045     return "";
1046 }
1047
1048 void
1049 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1050 {
1051   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1052   if (TARGET_GENERAL_REGS_ONLY)
1053     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1054   else
1055     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1056 }
1057
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
1061    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
1063    irrespectively of its cost results in bad allocations with many redundant
1064    int<->FP moves which are expensive on various cores.
1065    To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
1067    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
1068    Otherwise set the allocno class depending on the mode.
1069    The result of this is that it is no longer inefficient to have a higher
1070    memory move cost than the register move cost.
1071 */
1072
1073 static reg_class_t
1074 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1075                                          reg_class_t best_class)
1076 {
1077   machine_mode mode;
1078
1079   if (allocno_class != ALL_REGS)
1080     return allocno_class;
1081
1082   if (best_class != ALL_REGS)
1083     return best_class;
1084
1085   mode = PSEUDO_REGNO_MODE (regno);
1086   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1087 }
1088
1089 static unsigned int
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1091 {
1092   if (GET_MODE_UNIT_SIZE (mode) == 4)
1093     return aarch64_tune_params.min_div_recip_mul_sf;
1094   return aarch64_tune_params.min_div_recip_mul_df;
1095 }
1096
1097 /* Return the reassociation width of treeop OPC with mode MODE.  */
1098 static int
1099 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1100 {
1101   if (VECTOR_MODE_P (mode))
1102     return aarch64_tune_params.vec_reassoc_width;
1103   if (INTEGRAL_MODE_P (mode))
1104     return aarch64_tune_params.int_reassoc_width;
1105   /* Avoid reassociating floating point addition so we emit more FMAs.  */
1106   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1107     return aarch64_tune_params.fp_reassoc_width;
1108   return 1;
1109 }
1110
1111 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1112 unsigned
1113 aarch64_dbx_register_number (unsigned regno)
1114 {
1115    if (GP_REGNUM_P (regno))
1116      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1117    else if (regno == SP_REGNUM)
1118      return AARCH64_DWARF_SP;
1119    else if (FP_REGNUM_P (regno))
1120      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1121    else if (PR_REGNUM_P (regno))
1122      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1123    else if (regno == VG_REGNUM)
1124      return AARCH64_DWARF_VG;
1125
1126    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1127       equivalent DWARF register.  */
1128    return DWARF_FRAME_REGISTERS;
1129 }
1130
1131 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
1132 static bool
1133 aarch64_advsimd_struct_mode_p (machine_mode mode)
1134 {
1135   return (TARGET_SIMD
1136           && (mode == OImode || mode == CImode || mode == XImode));
1137 }
1138
1139 /* Return true if MODE is an SVE predicate mode.  */
1140 static bool
1141 aarch64_sve_pred_mode_p (machine_mode mode)
1142 {
1143   return (TARGET_SVE
1144           && (mode == VNx16BImode
1145               || mode == VNx8BImode
1146               || mode == VNx4BImode
1147               || mode == VNx2BImode));
1148 }
1149
1150 /* Three mutually-exclusive flags describing a vector or predicate type.  */
1151 const unsigned int VEC_ADVSIMD  = 1;
1152 const unsigned int VEC_SVE_DATA = 2;
1153 const unsigned int VEC_SVE_PRED = 4;
1154 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1155    a structure of 2, 3 or 4 vectors.  */
1156 const unsigned int VEC_STRUCT   = 8;
1157 /* Useful combinations of the above.  */
1158 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
1159 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1160
1161 /* Return a set of flags describing the vector properties of mode MODE.
1162    Ignore modes that are not supported by the current target.  */
1163 static unsigned int
1164 aarch64_classify_vector_mode (machine_mode mode)
1165 {
1166   if (aarch64_advsimd_struct_mode_p (mode))
1167     return VEC_ADVSIMD | VEC_STRUCT;
1168
1169   if (aarch64_sve_pred_mode_p (mode))
1170     return VEC_SVE_PRED;
1171
1172   scalar_mode inner = GET_MODE_INNER (mode);
1173   if (VECTOR_MODE_P (mode)
1174       && (inner == QImode
1175           || inner == HImode
1176           || inner == HFmode
1177           || inner == SImode
1178           || inner == SFmode
1179           || inner == DImode
1180           || inner == DFmode))
1181     {
1182       if (TARGET_SVE)
1183         {
1184           if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1185             return VEC_SVE_DATA;
1186           if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1187               || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1188               || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1189             return VEC_SVE_DATA | VEC_STRUCT;
1190         }
1191
1192       /* This includes V1DF but not V1DI (which doesn't exist).  */
1193       if (TARGET_SIMD
1194           && (known_eq (GET_MODE_BITSIZE (mode), 64)
1195               || known_eq (GET_MODE_BITSIZE (mode), 128)))
1196         return VEC_ADVSIMD;
1197     }
1198
1199   return 0;
1200 }
1201
1202 /* Return true if MODE is any of the data vector modes, including
1203    structure modes.  */
1204 static bool
1205 aarch64_vector_data_mode_p (machine_mode mode)
1206 {
1207   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1208 }
1209
1210 /* Return true if MODE is an SVE data vector mode; either a single vector
1211    or a structure of vectors.  */
1212 static bool
1213 aarch64_sve_data_mode_p (machine_mode mode)
1214 {
1215   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1216 }
1217
1218 /* Implement target hook TARGET_ARRAY_MODE.  */
1219 static opt_machine_mode
1220 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1221 {
1222   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1223       && IN_RANGE (nelems, 2, 4))
1224     return mode_for_vector (GET_MODE_INNER (mode),
1225                             GET_MODE_NUNITS (mode) * nelems);
1226
1227   return opt_machine_mode ();
1228 }
1229
1230 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1231 static bool
1232 aarch64_array_mode_supported_p (machine_mode mode,
1233                                 unsigned HOST_WIDE_INT nelems)
1234 {
1235   if (TARGET_SIMD
1236       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1237           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1238       && (nelems >= 2 && nelems <= 4))
1239     return true;
1240
1241   return false;
1242 }
1243
1244 /* Return the SVE predicate mode to use for elements that have
1245    ELEM_NBYTES bytes, if such a mode exists.  */
1246
1247 opt_machine_mode
1248 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1249 {
1250   if (TARGET_SVE)
1251     {
1252       if (elem_nbytes == 1)
1253         return VNx16BImode;
1254       if (elem_nbytes == 2)
1255         return VNx8BImode;
1256       if (elem_nbytes == 4)
1257         return VNx4BImode;
1258       if (elem_nbytes == 8)
1259         return VNx2BImode;
1260     }
1261   return opt_machine_mode ();
1262 }
1263
1264 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
1265
1266 static opt_machine_mode
1267 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1268 {
1269   if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1270     {
1271       unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1272       machine_mode pred_mode;
1273       if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1274         return pred_mode;
1275     }
1276
1277   return default_get_mask_mode (nunits, nbytes);
1278 }
1279
1280 /* Implement TARGET_HARD_REGNO_NREGS.  */
1281
1282 static unsigned int
1283 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1284 {
1285   /* ??? Logically we should only need to provide a value when
1286      HARD_REGNO_MODE_OK says that the combination is valid,
1287      but at the moment we need to handle all modes.  Just ignore
1288      any runtime parts for registers that can't store them.  */
1289   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1290   switch (aarch64_regno_regclass (regno))
1291     {
1292     case FP_REGS:
1293     case FP_LO_REGS:
1294       if (aarch64_sve_data_mode_p (mode))
1295         return exact_div (GET_MODE_SIZE (mode),
1296                           BYTES_PER_SVE_VECTOR).to_constant ();
1297       return CEIL (lowest_size, UNITS_PER_VREG);
1298     case PR_REGS:
1299     case PR_LO_REGS:
1300     case PR_HI_REGS:
1301       return 1;
1302     default:
1303       return CEIL (lowest_size, UNITS_PER_WORD);
1304     }
1305   gcc_unreachable ();
1306 }
1307
1308 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1309
1310 static bool
1311 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1312 {
1313   if (GET_MODE_CLASS (mode) == MODE_CC)
1314     return regno == CC_REGNUM;
1315
1316   if (regno == VG_REGNUM)
1317     /* This must have the same size as _Unwind_Word.  */
1318     return mode == DImode;
1319
1320   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1321   if (vec_flags & VEC_SVE_PRED)
1322     return PR_REGNUM_P (regno);
1323
1324   if (PR_REGNUM_P (regno))
1325     return 0;
1326
1327   if (regno == SP_REGNUM)
1328     /* The purpose of comparing with ptr_mode is to support the
1329        global register variable associated with the stack pointer
1330        register via the syntax of asm ("wsp") in ILP32.  */
1331     return mode == Pmode || mode == ptr_mode;
1332
1333   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1334     return mode == Pmode;
1335
1336   if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1337     return true;
1338
1339   if (FP_REGNUM_P (regno))
1340     {
1341       if (vec_flags & VEC_STRUCT)
1342         return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1343       else
1344         return !VECTOR_MODE_P (mode) || vec_flags != 0;
1345     }
1346
1347   return false;
1348 }
1349
1350 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1351    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1352    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1353
1354 static bool
1355 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1356 {
1357   return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1358 }
1359
1360 /* Implement REGMODE_NATURAL_SIZE.  */
1361 poly_uint64
1362 aarch64_regmode_natural_size (machine_mode mode)
1363 {
1364   /* The natural size for SVE data modes is one SVE data vector,
1365      and similarly for predicates.  We can't independently modify
1366      anything smaller than that.  */
1367   /* ??? For now, only do this for variable-width SVE registers.
1368      Doing it for constant-sized registers breaks lower-subreg.c.  */
1369   /* ??? And once that's fixed, we should probably have similar
1370      code for Advanced SIMD.  */
1371   if (!aarch64_sve_vg.is_constant ())
1372     {
1373       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1374       if (vec_flags & VEC_SVE_PRED)
1375         return BYTES_PER_SVE_PRED;
1376       if (vec_flags & VEC_SVE_DATA)
1377         return BYTES_PER_SVE_VECTOR;
1378     }
1379   return UNITS_PER_WORD;
1380 }
1381
1382 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1383 machine_mode
1384 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1385                                      machine_mode mode)
1386 {
1387   /* The predicate mode determines which bits are significant and
1388      which are "don't care".  Decreasing the number of lanes would
1389      lose data while increasing the number of lanes would make bits
1390      unnecessarily significant.  */
1391   if (PR_REGNUM_P (regno))
1392     return mode;
1393   if (known_ge (GET_MODE_SIZE (mode), 4))
1394     return mode;
1395   else
1396     return SImode;
1397 }
1398
1399 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
1400    that strcpy from constants will be faster.  */
1401
1402 static HOST_WIDE_INT
1403 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1404 {
1405   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1406     return MAX (align, BITS_PER_WORD);
1407   return align;
1408 }
1409
1410 /* Return true if calls to DECL should be treated as
1411    long-calls (ie called via a register).  */
1412 static bool
1413 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1414 {
1415   return false;
1416 }
1417
1418 /* Return true if calls to symbol-ref SYM should be treated as
1419    long-calls (ie called via a register).  */
1420 bool
1421 aarch64_is_long_call_p (rtx sym)
1422 {
1423   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1424 }
1425
1426 /* Return true if calls to symbol-ref SYM should not go through
1427    plt stubs.  */
1428
1429 bool
1430 aarch64_is_noplt_call_p (rtx sym)
1431 {
1432   const_tree decl = SYMBOL_REF_DECL (sym);
1433
1434   if (flag_pic
1435       && decl
1436       && (!flag_plt
1437           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1438       && !targetm.binds_local_p (decl))
1439     return true;
1440
1441   return false;
1442 }
1443
1444 /* Return true if the offsets to a zero/sign-extract operation
1445    represent an expression that matches an extend operation.  The
1446    operands represent the paramters from
1447
1448    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1449 bool
1450 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1451                                 rtx extract_imm)
1452 {
1453   HOST_WIDE_INT mult_val, extract_val;
1454
1455   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1456     return false;
1457
1458   mult_val = INTVAL (mult_imm);
1459   extract_val = INTVAL (extract_imm);
1460
1461   if (extract_val > 8
1462       && extract_val < GET_MODE_BITSIZE (mode)
1463       && exact_log2 (extract_val & ~7) > 0
1464       && (extract_val & 7) <= 4
1465       && mult_val == (1 << (extract_val & 7)))
1466     return true;
1467
1468   return false;
1469 }
1470
1471 /* Emit an insn that's a simple single-set.  Both the operands must be
1472    known to be valid.  */
1473 inline static rtx_insn *
1474 emit_set_insn (rtx x, rtx y)
1475 {
1476   return emit_insn (gen_rtx_SET (x, y));
1477 }
1478
1479 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1480    return the rtx for register 0 in the proper mode.  */
1481 rtx
1482 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1483 {
1484   machine_mode mode = SELECT_CC_MODE (code, x, y);
1485   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1486
1487   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1488   return cc_reg;
1489 }
1490
1491 /* Build the SYMBOL_REF for __tls_get_addr.  */
1492
1493 static GTY(()) rtx tls_get_addr_libfunc;
1494
1495 rtx
1496 aarch64_tls_get_addr (void)
1497 {
1498   if (!tls_get_addr_libfunc)
1499     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1500   return tls_get_addr_libfunc;
1501 }
1502
1503 /* Return the TLS model to use for ADDR.  */
1504
1505 static enum tls_model
1506 tls_symbolic_operand_type (rtx addr)
1507 {
1508   enum tls_model tls_kind = TLS_MODEL_NONE;
1509   if (GET_CODE (addr) == CONST)
1510     {
1511       poly_int64 addend;
1512       rtx sym = strip_offset (addr, &addend);
1513       if (GET_CODE (sym) == SYMBOL_REF)
1514         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1515     }
1516   else if (GET_CODE (addr) == SYMBOL_REF)
1517     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1518
1519   return tls_kind;
1520 }
1521
1522 /* We'll allow lo_sum's in addresses in our legitimate addresses
1523    so that combine would take care of combining addresses where
1524    necessary, but for generation purposes, we'll generate the address
1525    as :
1526    RTL                               Absolute
1527    tmp = hi (symbol_ref);            adrp  x1, foo
1528    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1529                                      nop
1530
1531    PIC                               TLS
1532    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1533    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1534                                      bl   __tls_get_addr
1535                                      nop
1536
1537    Load TLS symbol, depending on TLS mechanism and TLS access model.
1538
1539    Global Dynamic - Traditional TLS:
1540    adrp tmp, :tlsgd:imm
1541    add  dest, tmp, #:tlsgd_lo12:imm
1542    bl   __tls_get_addr
1543
1544    Global Dynamic - TLS Descriptors:
1545    adrp dest, :tlsdesc:imm
1546    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1547    add  dest, dest, #:tlsdesc_lo12:imm
1548    blr  tmp
1549    mrs  tp, tpidr_el0
1550    add  dest, dest, tp
1551
1552    Initial Exec:
1553    mrs  tp, tpidr_el0
1554    adrp tmp, :gottprel:imm
1555    ldr  dest, [tmp, #:gottprel_lo12:imm]
1556    add  dest, dest, tp
1557
1558    Local Exec:
1559    mrs  tp, tpidr_el0
1560    add  t0, tp, #:tprel_hi12:imm, lsl #12
1561    add  t0, t0, #:tprel_lo12_nc:imm
1562 */
1563
1564 static void
1565 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1566                                    enum aarch64_symbol_type type)
1567 {
1568   switch (type)
1569     {
1570     case SYMBOL_SMALL_ABSOLUTE:
1571       {
1572         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1573         rtx tmp_reg = dest;
1574         machine_mode mode = GET_MODE (dest);
1575
1576         gcc_assert (mode == Pmode || mode == ptr_mode);
1577
1578         if (can_create_pseudo_p ())
1579           tmp_reg = gen_reg_rtx (mode);
1580
1581         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1582         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1583         return;
1584       }
1585
1586     case SYMBOL_TINY_ABSOLUTE:
1587       emit_insn (gen_rtx_SET (dest, imm));
1588       return;
1589
1590     case SYMBOL_SMALL_GOT_28K:
1591       {
1592         machine_mode mode = GET_MODE (dest);
1593         rtx gp_rtx = pic_offset_table_rtx;
1594         rtx insn;
1595         rtx mem;
1596
1597         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1598            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1599            decide rtx costs, in which case pic_offset_table_rtx is not
1600            initialized.  For that case no need to generate the first adrp
1601            instruction as the final cost for global variable access is
1602            one instruction.  */
1603         if (gp_rtx != NULL)
1604           {
1605             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1606                using the page base as GOT base, the first page may be wasted,
1607                in the worst scenario, there is only 28K space for GOT).
1608
1609                The generate instruction sequence for accessing global variable
1610                is:
1611
1612                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1613
1614                Only one instruction needed. But we must initialize
1615                pic_offset_table_rtx properly.  We generate initialize insn for
1616                every global access, and allow CSE to remove all redundant.
1617
1618                The final instruction sequences will look like the following
1619                for multiply global variables access.
1620
1621                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1622
1623                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1624                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1625                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1626                  ...  */
1627
1628             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1629             crtl->uses_pic_offset_table = 1;
1630             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1631
1632             if (mode != GET_MODE (gp_rtx))
1633              gp_rtx = gen_lowpart (mode, gp_rtx);
1634
1635           }
1636
1637         if (mode == ptr_mode)
1638           {
1639             if (mode == DImode)
1640               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1641             else
1642               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1643
1644             mem = XVECEXP (SET_SRC (insn), 0, 0);
1645           }
1646         else
1647           {
1648             gcc_assert (mode == Pmode);
1649
1650             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1651             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1652           }
1653
1654         /* The operand is expected to be MEM.  Whenever the related insn
1655            pattern changed, above code which calculate mem should be
1656            updated.  */
1657         gcc_assert (GET_CODE (mem) == MEM);
1658         MEM_READONLY_P (mem) = 1;
1659         MEM_NOTRAP_P (mem) = 1;
1660         emit_insn (insn);
1661         return;
1662       }
1663
1664     case SYMBOL_SMALL_GOT_4G:
1665       {
1666         /* In ILP32, the mode of dest can be either SImode or DImode,
1667            while the got entry is always of SImode size.  The mode of
1668            dest depends on how dest is used: if dest is assigned to a
1669            pointer (e.g. in the memory), it has SImode; it may have
1670            DImode if dest is dereferenced to access the memeory.
1671            This is why we have to handle three different ldr_got_small
1672            patterns here (two patterns for ILP32).  */
1673
1674         rtx insn;
1675         rtx mem;
1676         rtx tmp_reg = dest;
1677         machine_mode mode = GET_MODE (dest);
1678
1679         if (can_create_pseudo_p ())
1680           tmp_reg = gen_reg_rtx (mode);
1681
1682         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1683         if (mode == ptr_mode)
1684           {
1685             if (mode == DImode)
1686               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1687             else
1688               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1689
1690             mem = XVECEXP (SET_SRC (insn), 0, 0);
1691           }
1692         else
1693           {
1694             gcc_assert (mode == Pmode);
1695
1696             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1697             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1698           }
1699
1700         gcc_assert (GET_CODE (mem) == MEM);
1701         MEM_READONLY_P (mem) = 1;
1702         MEM_NOTRAP_P (mem) = 1;
1703         emit_insn (insn);
1704         return;
1705       }
1706
1707     case SYMBOL_SMALL_TLSGD:
1708       {
1709         rtx_insn *insns;
1710         machine_mode mode = GET_MODE (dest);
1711         rtx result = gen_rtx_REG (mode, R0_REGNUM);
1712
1713         start_sequence ();
1714         if (TARGET_ILP32)
1715           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1716         else
1717           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1718         insns = get_insns ();
1719         end_sequence ();
1720
1721         RTL_CONST_CALL_P (insns) = 1;
1722         emit_libcall_block (insns, dest, result, imm);
1723         return;
1724       }
1725
1726     case SYMBOL_SMALL_TLSDESC:
1727       {
1728         machine_mode mode = GET_MODE (dest);
1729         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1730         rtx tp;
1731
1732         gcc_assert (mode == Pmode || mode == ptr_mode);
1733
1734         /* In ILP32, the got entry is always of SImode size.  Unlike
1735            small GOT, the dest is fixed at reg 0.  */
1736         if (TARGET_ILP32)
1737           emit_insn (gen_tlsdesc_small_si (imm));
1738         else
1739           emit_insn (gen_tlsdesc_small_di (imm));
1740         tp = aarch64_load_tp (NULL);
1741
1742         if (mode != Pmode)
1743           tp = gen_lowpart (mode, tp);
1744
1745         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1746         if (REG_P (dest))
1747           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1748         return;
1749       }
1750
1751     case SYMBOL_SMALL_TLSIE:
1752       {
1753         /* In ILP32, the mode of dest can be either SImode or DImode,
1754            while the got entry is always of SImode size.  The mode of
1755            dest depends on how dest is used: if dest is assigned to a
1756            pointer (e.g. in the memory), it has SImode; it may have
1757            DImode if dest is dereferenced to access the memeory.
1758            This is why we have to handle three different tlsie_small
1759            patterns here (two patterns for ILP32).  */
1760         machine_mode mode = GET_MODE (dest);
1761         rtx tmp_reg = gen_reg_rtx (mode);
1762         rtx tp = aarch64_load_tp (NULL);
1763
1764         if (mode == ptr_mode)
1765           {
1766             if (mode == DImode)
1767               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1768             else
1769               {
1770                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1771                 tp = gen_lowpart (mode, tp);
1772               }
1773           }
1774         else
1775           {
1776             gcc_assert (mode == Pmode);
1777             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1778           }
1779
1780         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1781         if (REG_P (dest))
1782           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1783         return;
1784       }
1785
1786     case SYMBOL_TLSLE12:
1787     case SYMBOL_TLSLE24:
1788     case SYMBOL_TLSLE32:
1789     case SYMBOL_TLSLE48:
1790       {
1791         machine_mode mode = GET_MODE (dest);
1792         rtx tp = aarch64_load_tp (NULL);
1793
1794         if (mode != Pmode)
1795           tp = gen_lowpart (mode, tp);
1796
1797         switch (type)
1798           {
1799           case SYMBOL_TLSLE12:
1800             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1801                         (dest, tp, imm));
1802             break;
1803           case SYMBOL_TLSLE24:
1804             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1805                         (dest, tp, imm));
1806           break;
1807           case SYMBOL_TLSLE32:
1808             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1809                         (dest, imm));
1810             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1811                         (dest, dest, tp));
1812           break;
1813           case SYMBOL_TLSLE48:
1814             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1815                         (dest, imm));
1816             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1817                         (dest, dest, tp));
1818             break;
1819           default:
1820             gcc_unreachable ();
1821           }
1822
1823         if (REG_P (dest))
1824           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1825         return;
1826       }
1827
1828     case SYMBOL_TINY_GOT:
1829       emit_insn (gen_ldr_got_tiny (dest, imm));
1830       return;
1831
1832     case SYMBOL_TINY_TLSIE:
1833       {
1834         machine_mode mode = GET_MODE (dest);
1835         rtx tp = aarch64_load_tp (NULL);
1836
1837         if (mode == ptr_mode)
1838           {
1839             if (mode == DImode)
1840               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1841             else
1842               {
1843                 tp = gen_lowpart (mode, tp);
1844                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1845               }
1846           }
1847         else
1848           {
1849             gcc_assert (mode == Pmode);
1850             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1851           }
1852
1853         if (REG_P (dest))
1854           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1855         return;
1856       }
1857
1858     default:
1859       gcc_unreachable ();
1860     }
1861 }
1862
1863 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1864    handle all moves if !can_create_pseudo_p ().  The distinction is
1865    important because, unlike emit_move_insn, the move expanders know
1866    how to force Pmode objects into the constant pool even when the
1867    constant pool address is not itself legitimate.  */
1868 static rtx
1869 aarch64_emit_move (rtx dest, rtx src)
1870 {
1871   return (can_create_pseudo_p ()
1872           ? emit_move_insn (dest, src)
1873           : emit_move_insn_1 (dest, src));
1874 }
1875
1876 /* Apply UNOPTAB to OP and store the result in DEST.  */
1877
1878 static void
1879 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1880 {
1881   rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1882   if (dest != tmp)
1883     emit_move_insn (dest, tmp);
1884 }
1885
1886 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
1887
1888 static void
1889 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1890 {
1891   rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1892                           OPTAB_DIRECT);
1893   if (dest != tmp)
1894     emit_move_insn (dest, tmp);
1895 }
1896
1897 /* Split a 128-bit move operation into two 64-bit move operations,
1898    taking care to handle partial overlap of register to register
1899    copies.  Special cases are needed when moving between GP regs and
1900    FP regs.  SRC can be a register, constant or memory; DST a register
1901    or memory.  If either operand is memory it must not have any side
1902    effects.  */
1903 void
1904 aarch64_split_128bit_move (rtx dst, rtx src)
1905 {
1906   rtx dst_lo, dst_hi;
1907   rtx src_lo, src_hi;
1908
1909   machine_mode mode = GET_MODE (dst);
1910
1911   gcc_assert (mode == TImode || mode == TFmode);
1912   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1913   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1914
1915   if (REG_P (dst) && REG_P (src))
1916     {
1917       int src_regno = REGNO (src);
1918       int dst_regno = REGNO (dst);
1919
1920       /* Handle FP <-> GP regs.  */
1921       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1922         {
1923           src_lo = gen_lowpart (word_mode, src);
1924           src_hi = gen_highpart (word_mode, src);
1925
1926           if (mode == TImode)
1927             {
1928               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1929               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1930             }
1931           else
1932             {
1933               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1934               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1935             }
1936           return;
1937         }
1938       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1939         {
1940           dst_lo = gen_lowpart (word_mode, dst);
1941           dst_hi = gen_highpart (word_mode, dst);
1942
1943           if (mode == TImode)
1944             {
1945               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1946               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1947             }
1948           else
1949             {
1950               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1951               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1952             }
1953           return;
1954         }
1955     }
1956
1957   dst_lo = gen_lowpart (word_mode, dst);
1958   dst_hi = gen_highpart (word_mode, dst);
1959   src_lo = gen_lowpart (word_mode, src);
1960   src_hi = gen_highpart_mode (word_mode, mode, src);
1961
1962   /* At most one pairing may overlap.  */
1963   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1964     {
1965       aarch64_emit_move (dst_hi, src_hi);
1966       aarch64_emit_move (dst_lo, src_lo);
1967     }
1968   else
1969     {
1970       aarch64_emit_move (dst_lo, src_lo);
1971       aarch64_emit_move (dst_hi, src_hi);
1972     }
1973 }
1974
1975 bool
1976 aarch64_split_128bit_move_p (rtx dst, rtx src)
1977 {
1978   return (! REG_P (src)
1979           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1980 }
1981
1982 /* Split a complex SIMD combine.  */
1983
1984 void
1985 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1986 {
1987   machine_mode src_mode = GET_MODE (src1);
1988   machine_mode dst_mode = GET_MODE (dst);
1989
1990   gcc_assert (VECTOR_MODE_P (dst_mode));
1991   gcc_assert (register_operand (dst, dst_mode)
1992               && register_operand (src1, src_mode)
1993               && register_operand (src2, src_mode));
1994
1995   rtx (*gen) (rtx, rtx, rtx);
1996
1997   switch (src_mode)
1998     {
1999     case E_V8QImode:
2000       gen = gen_aarch64_simd_combinev8qi;
2001       break;
2002     case E_V4HImode:
2003       gen = gen_aarch64_simd_combinev4hi;
2004       break;
2005     case E_V2SImode:
2006       gen = gen_aarch64_simd_combinev2si;
2007       break;
2008     case E_V4HFmode:
2009       gen = gen_aarch64_simd_combinev4hf;
2010       break;
2011     case E_V2SFmode:
2012       gen = gen_aarch64_simd_combinev2sf;
2013       break;
2014     case E_DImode:
2015       gen = gen_aarch64_simd_combinedi;
2016       break;
2017     case E_DFmode:
2018       gen = gen_aarch64_simd_combinedf;
2019       break;
2020     default:
2021       gcc_unreachable ();
2022     }
2023
2024   emit_insn (gen (dst, src1, src2));
2025   return;
2026 }
2027
2028 /* Split a complex SIMD move.  */
2029
2030 void
2031 aarch64_split_simd_move (rtx dst, rtx src)
2032 {
2033   machine_mode src_mode = GET_MODE (src);
2034   machine_mode dst_mode = GET_MODE (dst);
2035
2036   gcc_assert (VECTOR_MODE_P (dst_mode));
2037
2038   if (REG_P (dst) && REG_P (src))
2039     {
2040       rtx (*gen) (rtx, rtx);
2041
2042       gcc_assert (VECTOR_MODE_P (src_mode));
2043
2044       switch (src_mode)
2045         {
2046         case E_V16QImode:
2047           gen = gen_aarch64_split_simd_movv16qi;
2048           break;
2049         case E_V8HImode:
2050           gen = gen_aarch64_split_simd_movv8hi;
2051           break;
2052         case E_V4SImode:
2053           gen = gen_aarch64_split_simd_movv4si;
2054           break;
2055         case E_V2DImode:
2056           gen = gen_aarch64_split_simd_movv2di;
2057           break;
2058         case E_V8HFmode:
2059           gen = gen_aarch64_split_simd_movv8hf;
2060           break;
2061         case E_V4SFmode:
2062           gen = gen_aarch64_split_simd_movv4sf;
2063           break;
2064         case E_V2DFmode:
2065           gen = gen_aarch64_split_simd_movv2df;
2066           break;
2067         default:
2068           gcc_unreachable ();
2069         }
2070
2071       emit_insn (gen (dst, src));
2072       return;
2073     }
2074 }
2075
2076 bool
2077 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2078                               machine_mode ymode, rtx y)
2079 {
2080   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2081   gcc_assert (r != NULL);
2082   return rtx_equal_p (x, r);
2083 }
2084
2085
2086 static rtx
2087 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2088 {
2089   if (can_create_pseudo_p ())
2090     return force_reg (mode, value);
2091   else
2092     {
2093       gcc_assert (x);
2094       aarch64_emit_move (x, value);
2095       return x;
2096     }
2097 }
2098
2099 /* Return true if we can move VALUE into a register using a single
2100    CNT[BHWD] instruction.  */
2101
2102 static bool
2103 aarch64_sve_cnt_immediate_p (poly_int64 value)
2104 {
2105   HOST_WIDE_INT factor = value.coeffs[0];
2106   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
2107   return (value.coeffs[1] == factor
2108           && IN_RANGE (factor, 2, 16 * 16)
2109           && (factor & 1) == 0
2110           && factor <= 16 * (factor & -factor));
2111 }
2112
2113 /* Likewise for rtx X.  */
2114
2115 bool
2116 aarch64_sve_cnt_immediate_p (rtx x)
2117 {
2118   poly_int64 value;
2119   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2120 }
2121
2122 /* Return the asm string for an instruction with a CNT-like vector size
2123    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2124    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2125    first part of the operands template (the part that comes before the
2126    vector size itself).  FACTOR is the number of quadwords.
2127    NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2128    If it is zero, we can use any element size.  */
2129
2130 static char *
2131 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2132                                   unsigned int factor,
2133                                   unsigned int nelts_per_vq)
2134 {
2135   static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2136
2137   if (nelts_per_vq == 0)
2138     /* There is some overlap in the ranges of the four CNT instructions.
2139        Here we always use the smallest possible element size, so that the
2140        multiplier is 1 whereever possible.  */
2141     nelts_per_vq = factor & -factor;
2142   int shift = std::min (exact_log2 (nelts_per_vq), 4);
2143   gcc_assert (IN_RANGE (shift, 1, 4));
2144   char suffix = "dwhb"[shift - 1];
2145
2146   factor >>= shift;
2147   unsigned int written;
2148   if (factor == 1)
2149     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2150                         prefix, suffix, operands);
2151   else
2152     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2153                         prefix, suffix, operands, factor);
2154   gcc_assert (written < sizeof (buffer));
2155   return buffer;
2156 }
2157
2158 /* Return the asm string for an instruction with a CNT-like vector size
2159    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2160    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2161    first part of the operands template (the part that comes before the
2162    vector size itself).  X is the value of the vector size operand,
2163    as a polynomial integer rtx.  */
2164
2165 char *
2166 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2167                                   rtx x)
2168 {
2169   poly_int64 value = rtx_to_poly_int64 (x);
2170   gcc_assert (aarch64_sve_cnt_immediate_p (value));
2171   return aarch64_output_sve_cnt_immediate (prefix, operands,
2172                                            value.coeffs[1], 0);
2173 }
2174
2175 /* Return true if we can add VALUE to a register using a single ADDVL
2176    or ADDPL instruction.  */
2177
2178 static bool
2179 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2180 {
2181   HOST_WIDE_INT factor = value.coeffs[0];
2182   if (factor == 0 || value.coeffs[1] != factor)
2183     return false;
2184   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2185      and a value of 16 is one vector width.  */
2186   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2187           || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2188 }
2189
2190 /* Likewise for rtx X.  */
2191
2192 bool
2193 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2194 {
2195   poly_int64 value;
2196   return (poly_int_rtx_p (x, &value)
2197           && aarch64_sve_addvl_addpl_immediate_p (value));
2198 }
2199
2200 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2201    and storing the result in operand 0.  */
2202
2203 char *
2204 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2205 {
2206   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2207   poly_int64 offset_value = rtx_to_poly_int64 (offset);
2208   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2209
2210   /* Use INC or DEC if possible.  */
2211   if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2212     {
2213       if (aarch64_sve_cnt_immediate_p (offset_value))
2214         return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2215                                                  offset_value.coeffs[1], 0);
2216       if (aarch64_sve_cnt_immediate_p (-offset_value))
2217         return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2218                                                  -offset_value.coeffs[1], 0);
2219     }
2220
2221   int factor = offset_value.coeffs[1];
2222   if ((factor & 15) == 0)
2223     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2224   else
2225     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2226   return buffer;
2227 }
2228
2229 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2230    instruction.  If it is, store the number of elements in each vector
2231    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2232    factor in *FACTOR_OUT (if nonnull).  */
2233
2234 bool
2235 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2236                                  unsigned int *nelts_per_vq_out)
2237 {
2238   rtx elt;
2239   poly_int64 value;
2240
2241   if (!const_vec_duplicate_p (x, &elt)
2242       || !poly_int_rtx_p (elt, &value))
2243     return false;
2244
2245   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2246   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2247     /* There's no vector INCB.  */
2248     return false;
2249
2250   HOST_WIDE_INT factor = value.coeffs[0];
2251   if (value.coeffs[1] != factor)
2252     return false;
2253
2254   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
2255   if ((factor % nelts_per_vq) != 0
2256       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2257     return false;
2258
2259   if (factor_out)
2260     *factor_out = factor;
2261   if (nelts_per_vq_out)
2262     *nelts_per_vq_out = nelts_per_vq;
2263   return true;
2264 }
2265
2266 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2267    instruction.  */
2268
2269 bool
2270 aarch64_sve_inc_dec_immediate_p (rtx x)
2271 {
2272   return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2273 }
2274
2275 /* Return the asm template for an SVE vector INC or DEC instruction.
2276    OPERANDS gives the operands before the vector count and X is the
2277    value of the vector count operand itself.  */
2278
2279 char *
2280 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2281 {
2282   int factor;
2283   unsigned int nelts_per_vq;
2284   if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2285     gcc_unreachable ();
2286   if (factor < 0)
2287     return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2288                                              nelts_per_vq);
2289   else
2290     return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2291                                              nelts_per_vq);
2292 }
2293
2294 static int
2295 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2296                                 scalar_int_mode mode)
2297 {
2298   int i;
2299   unsigned HOST_WIDE_INT val, val2, mask;
2300   int one_match, zero_match;
2301   int num_insns;
2302
2303   val = INTVAL (imm);
2304
2305   if (aarch64_move_imm (val, mode))
2306     {
2307       if (generate)
2308         emit_insn (gen_rtx_SET (dest, imm));
2309       return 1;
2310     }
2311
2312   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2313      (with XXXX non-zero). In that case check to see if the move can be done in
2314      a smaller mode.  */
2315   val2 = val & 0xffffffff;
2316   if (mode == DImode
2317       && aarch64_move_imm (val2, SImode)
2318       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2319     {
2320       if (generate)
2321         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2322
2323       /* Check if we have to emit a second instruction by checking to see
2324          if any of the upper 32 bits of the original DI mode value is set.  */
2325       if (val == val2)
2326         return 1;
2327
2328       i = (val >> 48) ? 48 : 32;
2329
2330       if (generate)
2331          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2332                                     GEN_INT ((val >> i) & 0xffff)));
2333
2334       return 2;
2335     }
2336
2337   if ((val >> 32) == 0 || mode == SImode)
2338     {
2339       if (generate)
2340         {
2341           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2342           if (mode == SImode)
2343             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2344                                        GEN_INT ((val >> 16) & 0xffff)));
2345           else
2346             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2347                                        GEN_INT ((val >> 16) & 0xffff)));
2348         }
2349       return 2;
2350     }
2351
2352   /* Remaining cases are all for DImode.  */
2353
2354   mask = 0xffff;
2355   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2356     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2357   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2358     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2359
2360   if (zero_match != 2 && one_match != 2)
2361     {
2362       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2363          For a 64-bit bitmask try whether changing 16 bits to all ones or
2364          zeroes creates a valid bitmask.  To check any repeated bitmask,
2365          try using 16 bits from the other 32-bit half of val.  */
2366
2367       for (i = 0; i < 64; i += 16, mask <<= 16)
2368         {
2369           val2 = val & ~mask;
2370           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2371             break;
2372           val2 = val | mask;
2373           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2374             break;
2375           val2 = val2 & ~mask;
2376           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2377           if (val2 != val && aarch64_bitmask_imm (val2, mode))
2378             break;
2379         }
2380       if (i != 64)
2381         {
2382           if (generate)
2383             {
2384               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2385               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2386                                          GEN_INT ((val >> i) & 0xffff)));
2387             }
2388           return 2;
2389         }
2390     }
2391
2392   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2393      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
2394      otherwise skip zero bits.  */
2395
2396   num_insns = 1;
2397   mask = 0xffff;
2398   val2 = one_match > zero_match ? ~val : val;
2399   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2400
2401   if (generate)
2402     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2403                                            ? (val | ~(mask << i))
2404                                            : (val & (mask << i)))));
2405   for (i += 16; i < 64; i += 16)
2406     {
2407       if ((val2 & (mask << i)) == 0)
2408         continue;
2409       if (generate)
2410         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2411                                    GEN_INT ((val >> i) & 0xffff)));
2412       num_insns ++;
2413     }
2414
2415   return num_insns;
2416 }
2417
2418 /* Return whether imm is a 128-bit immediate which is simple enough to
2419    expand inline.  */
2420 bool
2421 aarch64_mov128_immediate (rtx imm)
2422 {
2423   if (GET_CODE (imm) == CONST_INT)
2424     return true;
2425
2426   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2427
2428   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2429   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2430
2431   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2432          + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2433 }
2434
2435
2436 /* Return the number of temporary registers that aarch64_add_offset_1
2437    would need to add OFFSET to a register.  */
2438
2439 static unsigned int
2440 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2441 {
2442   return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2443 }
2444
2445 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
2446    a non-polynomial OFFSET.  MODE is the mode of the addition.
2447    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2448    be set and CFA adjustments added to the generated instructions.
2449
2450    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2451    temporary if register allocation is already complete.  This temporary
2452    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
2453    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2454    the immediate again.
2455
2456    Since this function may be used to adjust the stack pointer, we must
2457    ensure that it cannot cause transient stack deallocation (for example
2458    by first incrementing SP and then decrementing when adjusting by a
2459    large immediate).  */
2460
2461 static void
2462 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2463                       rtx src, HOST_WIDE_INT offset, rtx temp1,
2464                       bool frame_related_p, bool emit_move_imm)
2465 {
2466   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2467   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2468
2469   HOST_WIDE_INT moffset = abs_hwi (offset);
2470   rtx_insn *insn;
2471
2472   if (!moffset)
2473     {
2474       if (!rtx_equal_p (dest, src))
2475         {
2476           insn = emit_insn (gen_rtx_SET (dest, src));
2477           RTX_FRAME_RELATED_P (insn) = frame_related_p;
2478         }
2479       return;
2480     }
2481
2482   /* Single instruction adjustment.  */
2483   if (aarch64_uimm12_shift (moffset))
2484     {
2485       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2486       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2487       return;
2488     }
2489
2490   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2491      and either:
2492
2493      a) the offset cannot be loaded by a 16-bit move or
2494      b) there is no spare register into which we can move it.  */
2495   if (moffset < 0x1000000
2496       && ((!temp1 && !can_create_pseudo_p ())
2497           || !aarch64_move_imm (moffset, mode)))
2498     {
2499       HOST_WIDE_INT low_off = moffset & 0xfff;
2500
2501       low_off = offset < 0 ? -low_off : low_off;
2502       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2503       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2504       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2505       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2506       return;
2507     }
2508
2509   /* Emit a move immediate if required and an addition/subtraction.  */
2510   if (emit_move_imm)
2511     {
2512       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2513       temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2514     }
2515   insn = emit_insn (offset < 0
2516                     ? gen_sub3_insn (dest, src, temp1)
2517                     : gen_add3_insn (dest, src, temp1));
2518   if (frame_related_p)
2519     {
2520       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2521       rtx adj = plus_constant (mode, src, offset);
2522       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2523     }
2524 }
2525
2526 /* Return the number of temporary registers that aarch64_add_offset
2527    would need to move OFFSET into a register or add OFFSET to a register;
2528    ADD_P is true if we want the latter rather than the former.  */
2529
2530 static unsigned int
2531 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2532 {
2533   /* This follows the same structure as aarch64_add_offset.  */
2534   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2535     return 0;
2536
2537   unsigned int count = 0;
2538   HOST_WIDE_INT factor = offset.coeffs[1];
2539   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2540   poly_int64 poly_offset (factor, factor);
2541   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2542     /* Need one register for the ADDVL/ADDPL result.  */
2543     count += 1;
2544   else if (factor != 0)
2545     {
2546       factor = abs (factor);
2547       if (factor > 16 * (factor & -factor))
2548         /* Need one register for the CNT result and one for the multiplication
2549            factor.  If necessary, the second temporary can be reused for the
2550            constant part of the offset.  */
2551         return 2;
2552       /* Need one register for the CNT result (which might then
2553          be shifted).  */
2554       count += 1;
2555     }
2556   return count + aarch64_add_offset_1_temporaries (constant);
2557 }
2558
2559 /* If X can be represented as a poly_int64, return the number
2560    of temporaries that are required to add it to a register.
2561    Return -1 otherwise.  */
2562
2563 int
2564 aarch64_add_offset_temporaries (rtx x)
2565 {
2566   poly_int64 offset;
2567   if (!poly_int_rtx_p (x, &offset))
2568     return -1;
2569   return aarch64_offset_temporaries (true, offset);
2570 }
2571
2572 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
2573    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2574    be set and CFA adjustments added to the generated instructions.
2575
2576    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2577    temporary if register allocation is already complete.  This temporary
2578    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2579    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2580    false to avoid emitting the immediate again.
2581
2582    TEMP2, if nonnull, is a second temporary register that doesn't
2583    overlap either DEST or REG.
2584
2585    Since this function may be used to adjust the stack pointer, we must
2586    ensure that it cannot cause transient stack deallocation (for example
2587    by first incrementing SP and then decrementing when adjusting by a
2588    large immediate).  */
2589
2590 static void
2591 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2592                     poly_int64 offset, rtx temp1, rtx temp2,
2593                     bool frame_related_p, bool emit_move_imm = true)
2594 {
2595   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2596   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2597   gcc_assert (temp1 == NULL_RTX
2598               || !frame_related_p
2599               || !reg_overlap_mentioned_p (temp1, dest));
2600   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2601
2602   /* Try using ADDVL or ADDPL to add the whole value.  */
2603   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2604     {
2605       rtx offset_rtx = gen_int_mode (offset, mode);
2606       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2607       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2608       return;
2609     }
2610
2611   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2612      SVE vector register, over and above the minimum size of 128 bits.
2613      This is equivalent to half the value returned by CNTD with a
2614      vector shape of ALL.  */
2615   HOST_WIDE_INT factor = offset.coeffs[1];
2616   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2617
2618   /* Try using ADDVL or ADDPL to add the VG-based part.  */
2619   poly_int64 poly_offset (factor, factor);
2620   if (src != const0_rtx
2621       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2622     {
2623       rtx offset_rtx = gen_int_mode (poly_offset, mode);
2624       if (frame_related_p)
2625         {
2626           rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2627           RTX_FRAME_RELATED_P (insn) = true;
2628           src = dest;
2629         }
2630       else
2631         {
2632           rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2633           src = aarch64_force_temporary (mode, temp1, addr);
2634           temp1 = temp2;
2635           temp2 = NULL_RTX;
2636         }
2637     }
2638   /* Otherwise use a CNT-based sequence.  */
2639   else if (factor != 0)
2640     {
2641       /* Use a subtraction if we have a negative factor.  */
2642       rtx_code code = PLUS;
2643       if (factor < 0)
2644         {
2645           factor = -factor;
2646           code = MINUS;
2647         }
2648
2649       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
2650          into the multiplication.  */
2651       rtx val;
2652       int shift = 0;
2653       if (factor & 1)
2654         /* Use a right shift by 1.  */
2655         shift = -1;
2656       else
2657         factor /= 2;
2658       HOST_WIDE_INT low_bit = factor & -factor;
2659       if (factor <= 16 * low_bit)
2660         {
2661           if (factor > 16 * 8)
2662             {
2663               /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2664                  the value with the minimum multiplier and shift it into
2665                  position.  */
2666               int extra_shift = exact_log2 (low_bit);
2667               shift += extra_shift;
2668               factor >>= extra_shift;
2669             }
2670           val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2671         }
2672       else
2673         {
2674           /* Use CNTD, then multiply it by FACTOR.  */
2675           val = gen_int_mode (poly_int64 (2, 2), mode);
2676           val = aarch64_force_temporary (mode, temp1, val);
2677
2678           /* Go back to using a negative multiplication factor if we have
2679              no register from which to subtract.  */
2680           if (code == MINUS && src == const0_rtx)
2681             {
2682               factor = -factor;
2683               code = PLUS;
2684             }
2685           rtx coeff1 = gen_int_mode (factor, mode);
2686           coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2687           val = gen_rtx_MULT (mode, val, coeff1);
2688         }
2689
2690       if (shift > 0)
2691         {
2692           /* Multiply by 1 << SHIFT.  */
2693           val = aarch64_force_temporary (mode, temp1, val);
2694           val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2695         }
2696       else if (shift == -1)
2697         {
2698           /* Divide by 2.  */
2699           val = aarch64_force_temporary (mode, temp1, val);
2700           val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2701         }
2702
2703       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
2704       if (src != const0_rtx)
2705         {
2706           val = aarch64_force_temporary (mode, temp1, val);
2707           val = gen_rtx_fmt_ee (code, mode, src, val);
2708         }
2709       else if (code == MINUS)
2710         {
2711           val = aarch64_force_temporary (mode, temp1, val);
2712           val = gen_rtx_NEG (mode, val);
2713         }
2714
2715       if (constant == 0 || frame_related_p)
2716         {
2717           rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2718           if (frame_related_p)
2719             {
2720               RTX_FRAME_RELATED_P (insn) = true;
2721               add_reg_note (insn, REG_CFA_ADJUST_CFA,
2722                             gen_rtx_SET (dest, plus_constant (Pmode, src,
2723                                                               poly_offset)));
2724             }
2725           src = dest;
2726           if (constant == 0)
2727             return;
2728         }
2729       else
2730         {
2731           src = aarch64_force_temporary (mode, temp1, val);
2732           temp1 = temp2;
2733           temp2 = NULL_RTX;
2734         }
2735
2736       emit_move_imm = true;
2737     }
2738
2739   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2740                         frame_related_p, emit_move_imm);
2741 }
2742
2743 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2744    than a poly_int64.  */
2745
2746 void
2747 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2748                           rtx offset_rtx, rtx temp1, rtx temp2)
2749 {
2750   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2751                       temp1, temp2, false);
2752 }
2753
2754 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2755    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
2756    if TEMP1 already contains abs (DELTA).  */
2757
2758 static inline void
2759 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2760 {
2761   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2762                       temp1, temp2, true, emit_move_imm);
2763 }
2764
2765 /* Subtract DELTA from the stack pointer, marking the instructions
2766    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
2767    if nonnull.  */
2768
2769 static inline void
2770 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2771 {
2772   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2773                       temp1, temp2, frame_related_p);
2774 }
2775
2776 /* Set DEST to (vec_series BASE STEP).  */
2777
2778 static void
2779 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2780 {
2781   machine_mode mode = GET_MODE (dest);
2782   scalar_mode inner = GET_MODE_INNER (mode);
2783
2784   /* Each operand can be a register or an immediate in the range [-16, 15].  */
2785   if (!aarch64_sve_index_immediate_p (base))
2786     base = force_reg (inner, base);
2787   if (!aarch64_sve_index_immediate_p (step))
2788     step = force_reg (inner, step);
2789
2790   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2791 }
2792
2793 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2794    integer of mode INT_MODE.  Return true on success.  */
2795
2796 static bool
2797 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2798                                       rtx src)
2799 {
2800   /* If the constant is smaller than 128 bits, we can do the move
2801      using a vector of SRC_MODEs.  */
2802   if (src_mode != TImode)
2803     {
2804       poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2805                                      GET_MODE_SIZE (src_mode));
2806       machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2807       emit_move_insn (gen_lowpart (dup_mode, dest),
2808                       gen_const_vec_duplicate (dup_mode, src));
2809       return true;
2810     }
2811
2812   /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
2813   src = force_const_mem (src_mode, src);
2814   if (!src)
2815     return false;
2816
2817   /* Make sure that the address is legitimate.  */
2818   if (!aarch64_sve_ld1r_operand_p (src))
2819     {
2820       rtx addr = force_reg (Pmode, XEXP (src, 0));
2821       src = replace_equiv_address (src, addr);
2822     }
2823
2824   machine_mode mode = GET_MODE (dest);
2825   unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2826   machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2827   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2828   src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2829   emit_insn (gen_rtx_SET (dest, src));
2830   return true;
2831 }
2832
2833 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2834    isn't a simple duplicate or series.  */
2835
2836 static void
2837 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2838 {
2839   machine_mode mode = GET_MODE (src);
2840   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2841   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2842   gcc_assert (npatterns > 1);
2843
2844   if (nelts_per_pattern == 1)
2845     {
2846       /* The constant is a repeating seqeuence of at least two elements,
2847          where the repeating elements occupy no more than 128 bits.
2848          Get an integer representation of the replicated value.  */
2849       scalar_int_mode int_mode;
2850       if (BYTES_BIG_ENDIAN)
2851         /* For now, always use LD1RQ to load the value on big-endian
2852            targets, since the handling of smaller integers includes a
2853            subreg that is semantically an element reverse.  */
2854         int_mode = TImode;
2855       else
2856         {
2857           unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2858           gcc_assert (int_bits <= 128);
2859           int_mode = int_mode_for_size (int_bits, 0).require ();
2860         }
2861       rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2862       if (int_value
2863           && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2864         return;
2865     }
2866
2867   /* Expand each pattern individually.  */
2868   rtx_vector_builder builder;
2869   auto_vec<rtx, 16> vectors (npatterns);
2870   for (unsigned int i = 0; i < npatterns; ++i)
2871     {
2872       builder.new_vector (mode, 1, nelts_per_pattern);
2873       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2874         builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2875       vectors.quick_push (force_reg (mode, builder.build ()));
2876     }
2877
2878   /* Use permutes to interleave the separate vectors.  */
2879   while (npatterns > 1)
2880     {
2881       npatterns /= 2;
2882       for (unsigned int i = 0; i < npatterns; ++i)
2883         {
2884           rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2885           rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2886           emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2887           vectors[i] = tmp;
2888         }
2889     }
2890   gcc_assert (vectors[0] == dest);
2891 }
2892
2893 /* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
2894    is a pattern that can be used to set DEST to a replicated scalar
2895    element.  */
2896
2897 void
2898 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2899                               rtx (*gen_vec_duplicate) (rtx, rtx))
2900 {
2901   machine_mode mode = GET_MODE (dest);
2902
2903   /* Check on what type of symbol it is.  */
2904   scalar_int_mode int_mode;
2905   if ((GET_CODE (imm) == SYMBOL_REF
2906        || GET_CODE (imm) == LABEL_REF
2907        || GET_CODE (imm) == CONST
2908        || GET_CODE (imm) == CONST_POLY_INT)
2909       && is_a <scalar_int_mode> (mode, &int_mode))
2910     {
2911       rtx mem;
2912       poly_int64 offset;
2913       HOST_WIDE_INT const_offset;
2914       enum aarch64_symbol_type sty;
2915
2916       /* If we have (const (plus symbol offset)), separate out the offset
2917          before we start classifying the symbol.  */
2918       rtx base = strip_offset (imm, &offset);
2919
2920       /* We must always add an offset involving VL separately, rather than
2921          folding it into the relocation.  */
2922       if (!offset.is_constant (&const_offset))
2923         {
2924           if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2925             emit_insn (gen_rtx_SET (dest, imm));
2926           else
2927             {
2928               /* Do arithmetic on 32-bit values if the result is smaller
2929                  than that.  */
2930               if (partial_subreg_p (int_mode, SImode))
2931                 {
2932                   /* It is invalid to do symbol calculations in modes
2933                      narrower than SImode.  */
2934                   gcc_assert (base == const0_rtx);
2935                   dest = gen_lowpart (SImode, dest);
2936                   int_mode = SImode;
2937                 }
2938               if (base != const0_rtx)
2939                 {
2940                   base = aarch64_force_temporary (int_mode, dest, base);
2941                   aarch64_add_offset (int_mode, dest, base, offset,
2942                                       NULL_RTX, NULL_RTX, false);
2943                 }
2944               else
2945                 aarch64_add_offset (int_mode, dest, base, offset,
2946                                     dest, NULL_RTX, false);
2947             }
2948           return;
2949         }
2950
2951       sty = aarch64_classify_symbol (base, const_offset);
2952       switch (sty)
2953         {
2954         case SYMBOL_FORCE_TO_MEM:
2955           if (const_offset != 0
2956               && targetm.cannot_force_const_mem (int_mode, imm))
2957             {
2958               gcc_assert (can_create_pseudo_p ());
2959               base = aarch64_force_temporary (int_mode, dest, base);
2960               aarch64_add_offset (int_mode, dest, base, const_offset,
2961                                   NULL_RTX, NULL_RTX, false);
2962               return;
2963             }
2964
2965           mem = force_const_mem (ptr_mode, imm);
2966           gcc_assert (mem);
2967
2968           /* If we aren't generating PC relative literals, then
2969              we need to expand the literal pool access carefully.
2970              This is something that needs to be done in a number
2971              of places, so could well live as a separate function.  */
2972           if (!aarch64_pcrelative_literal_loads)
2973             {
2974               gcc_assert (can_create_pseudo_p ());
2975               base = gen_reg_rtx (ptr_mode);
2976               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2977               if (ptr_mode != Pmode)
2978                 base = convert_memory_address (Pmode, base);
2979               mem = gen_rtx_MEM (ptr_mode, base);
2980             }
2981
2982           if (int_mode != ptr_mode)
2983             mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2984
2985           emit_insn (gen_rtx_SET (dest, mem));
2986
2987           return;
2988
2989         case SYMBOL_SMALL_TLSGD:
2990         case SYMBOL_SMALL_TLSDESC:
2991         case SYMBOL_SMALL_TLSIE:
2992         case SYMBOL_SMALL_GOT_28K:
2993         case SYMBOL_SMALL_GOT_4G:
2994         case SYMBOL_TINY_GOT:
2995         case SYMBOL_TINY_TLSIE:
2996           if (const_offset != 0)
2997             {
2998               gcc_assert(can_create_pseudo_p ());
2999               base = aarch64_force_temporary (int_mode, dest, base);
3000               aarch64_add_offset (int_mode, dest, base, const_offset,
3001                                   NULL_RTX, NULL_RTX, false);
3002               return;
3003             }
3004           /* FALLTHRU */
3005
3006         case SYMBOL_SMALL_ABSOLUTE:
3007         case SYMBOL_TINY_ABSOLUTE:
3008         case SYMBOL_TLSLE12:
3009         case SYMBOL_TLSLE24:
3010         case SYMBOL_TLSLE32:
3011         case SYMBOL_TLSLE48:
3012           aarch64_load_symref_appropriately (dest, imm, sty);
3013           return;
3014
3015         default:
3016           gcc_unreachable ();
3017         }
3018     }
3019
3020   if (!CONST_INT_P (imm))
3021     {
3022       rtx base, step, value;
3023       if (GET_CODE (imm) == HIGH
3024           || aarch64_simd_valid_immediate (imm, NULL))
3025         emit_insn (gen_rtx_SET (dest, imm));
3026       else if (const_vec_series_p (imm, &base, &step))
3027         aarch64_expand_vec_series (dest, base, step);
3028       else if (const_vec_duplicate_p (imm, &value))
3029         {
3030           /* If the constant is out of range of an SVE vector move,
3031              load it from memory if we can, otherwise move it into
3032              a register and use a DUP.  */
3033           scalar_mode inner_mode = GET_MODE_INNER (mode);
3034           rtx op = force_const_mem (inner_mode, value);
3035           if (!op)
3036             op = force_reg (inner_mode, value);
3037           else if (!aarch64_sve_ld1r_operand_p (op))
3038             {
3039               rtx addr = force_reg (Pmode, XEXP (op, 0));
3040               op = replace_equiv_address (op, addr);
3041             }
3042           emit_insn (gen_vec_duplicate (dest, op));
3043         }
3044       else if (GET_CODE (imm) == CONST_VECTOR
3045                && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3046         aarch64_expand_sve_const_vector (dest, imm);
3047       else
3048         {
3049           rtx mem = force_const_mem (mode, imm);
3050           gcc_assert (mem);
3051           emit_move_insn (dest, mem);
3052         }
3053
3054       return;
3055     }
3056
3057   aarch64_internal_mov_immediate (dest, imm, true,
3058                                   as_a <scalar_int_mode> (mode));
3059 }
3060
3061 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
3062    that is known to contain PTRUE.  */
3063
3064 void
3065 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3066 {
3067   emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3068                                                 gen_rtvec (2, pred, src),
3069                                                 UNSPEC_MERGE_PTRUE)));
3070 }
3071
3072 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3073    operand is in memory.  In this case we need to use the predicated LD1
3074    and ST1 instead of LDR and STR, both for correctness on big-endian
3075    targets and because LD1 and ST1 support a wider range of addressing modes.
3076    PRED_MODE is the mode of the predicate.
3077
3078    See the comment at the head of aarch64-sve.md for details about the
3079    big-endian handling.  */
3080
3081 void
3082 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3083 {
3084   machine_mode mode = GET_MODE (dest);
3085   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3086   if (!register_operand (src, mode)
3087       && !register_operand (dest, mode))
3088     {
3089       rtx tmp = gen_reg_rtx (mode);
3090       if (MEM_P (src))
3091         aarch64_emit_sve_pred_move (tmp, ptrue, src);
3092       else
3093         emit_move_insn (tmp, src);
3094       src = tmp;
3095     }
3096   aarch64_emit_sve_pred_move (dest, ptrue, src);
3097 }
3098
3099 /* Called only on big-endian targets.  See whether an SVE vector move
3100    from SRC to DEST is effectively a REV[BHW] instruction, because at
3101    least one operand is a subreg of an SVE vector that has wider or
3102    narrower elements.  Return true and emit the instruction if so.
3103
3104    For example:
3105
3106      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3107
3108    represents a VIEW_CONVERT between the following vectors, viewed
3109    in memory order:
3110
3111      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
3112      R1: { [0],      [1],      [2],      [3],     ... }
3113
3114    The high part of lane X in R2 should therefore correspond to lane X*2
3115    of R1, but the register representations are:
3116
3117          msb                                      lsb
3118      R2: ...... [1].high  [1].low   [0].high  [0].low
3119      R1: ...... [3]       [2]       [1]       [0]
3120
3121    where the low part of lane X in R2 corresponds to lane X*2 in R1.
3122    We therefore need a reverse operation to swap the high and low values
3123    around.
3124
3125    This is purely an optimization.  Without it we would spill the
3126    subreg operand to the stack in one mode and reload it in the
3127    other mode, which has the same effect as the REV.  */
3128
3129 bool
3130 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3131 {
3132   gcc_assert (BYTES_BIG_ENDIAN);
3133   if (GET_CODE (dest) == SUBREG)
3134     dest = SUBREG_REG (dest);
3135   if (GET_CODE (src) == SUBREG)
3136     src = SUBREG_REG (src);
3137
3138   /* The optimization handles two single SVE REGs with different element
3139      sizes.  */
3140   if (!REG_P (dest)
3141       || !REG_P (src)
3142       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3143       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3144       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3145           == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3146     return false;
3147
3148   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
3149   rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3150   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3151                                UNSPEC_REV_SUBREG);
3152   emit_insn (gen_rtx_SET (dest, unspec));
3153   return true;
3154 }
3155
3156 /* Return a copy of X with mode MODE, without changing its other
3157    attributes.  Unlike gen_lowpart, this doesn't care whether the
3158    mode change is valid.  */
3159
3160 static rtx
3161 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3162 {
3163   if (GET_MODE (x) == mode)
3164     return x;
3165
3166   x = shallow_copy_rtx (x);
3167   set_mode_and_regno (x, mode, REGNO (x));
3168   return x;
3169 }
3170
3171 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3172    operands.  */
3173
3174 void
3175 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3176 {
3177   /* Decide which REV operation we need.  The mode with narrower elements
3178      determines the mode of the operands and the mode with the wider
3179      elements determines the reverse width.  */
3180   machine_mode mode_with_wider_elts = GET_MODE (dest);
3181   machine_mode mode_with_narrower_elts = GET_MODE (src);
3182   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3183       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3184     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3185
3186   unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3187   unsigned int unspec;
3188   if (wider_bytes == 8)
3189     unspec = UNSPEC_REV64;
3190   else if (wider_bytes == 4)
3191     unspec = UNSPEC_REV32;
3192   else if (wider_bytes == 2)
3193     unspec = UNSPEC_REV16;
3194   else
3195     gcc_unreachable ();
3196   machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3197
3198   /* Emit:
3199
3200        (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3201                          UNSPEC_MERGE_PTRUE))
3202
3203      with the appropriate modes.  */
3204   ptrue = gen_lowpart (pred_mode, ptrue);
3205   dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3206   src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3207   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3208   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3209                         UNSPEC_MERGE_PTRUE);
3210   emit_insn (gen_rtx_SET (dest, src));
3211 }
3212
3213 static bool
3214 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3215                                  tree exp ATTRIBUTE_UNUSED)
3216 {
3217   /* Currently, always true.  */
3218   return true;
3219 }
3220
3221 /* Implement TARGET_PASS_BY_REFERENCE.  */
3222
3223 static bool
3224 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3225                            machine_mode mode,
3226                            const_tree type,
3227                            bool named ATTRIBUTE_UNUSED)
3228 {
3229   HOST_WIDE_INT size;
3230   machine_mode dummymode;
3231   int nregs;
3232
3233   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
3234   if (mode == BLKmode && type)
3235     size = int_size_in_bytes (type);
3236   else
3237     /* No frontends can create types with variable-sized modes, so we
3238        shouldn't be asked to pass or return them.  */
3239     size = GET_MODE_SIZE (mode).to_constant ();
3240
3241   /* Aggregates are passed by reference based on their size.  */
3242   if (type && AGGREGATE_TYPE_P (type))
3243     {
3244       size = int_size_in_bytes (type);
3245     }
3246
3247   /* Variable sized arguments are always returned by reference.  */
3248   if (size < 0)
3249     return true;
3250
3251   /* Can this be a candidate to be passed in fp/simd register(s)?  */
3252   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3253                                                &dummymode, &nregs,
3254                                                NULL))
3255     return false;
3256
3257   /* Arguments which are variable sized or larger than 2 registers are
3258      passed by reference unless they are a homogenous floating point
3259      aggregate.  */
3260   return size > 2 * UNITS_PER_WORD;
3261 }
3262
3263 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
3264 static bool
3265 aarch64_return_in_msb (const_tree valtype)
3266 {
3267   machine_mode dummy_mode;
3268   int dummy_int;
3269
3270   /* Never happens in little-endian mode.  */
3271   if (!BYTES_BIG_ENDIAN)
3272     return false;
3273
3274   /* Only composite types smaller than or equal to 16 bytes can
3275      be potentially returned in registers.  */
3276   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3277       || int_size_in_bytes (valtype) <= 0
3278       || int_size_in_bytes (valtype) > 16)
3279     return false;
3280
3281   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3282      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3283      is always passed/returned in the least significant bits of fp/simd
3284      register(s).  */
3285   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3286                                                &dummy_mode, &dummy_int, NULL))
3287     return false;
3288
3289   return true;
3290 }
3291
3292 /* Implement TARGET_FUNCTION_VALUE.
3293    Define how to find the value returned by a function.  */
3294
3295 static rtx
3296 aarch64_function_value (const_tree type, const_tree func,
3297                         bool outgoing ATTRIBUTE_UNUSED)
3298 {
3299   machine_mode mode;
3300   int unsignedp;
3301   int count;
3302   machine_mode ag_mode;
3303
3304   mode = TYPE_MODE (type);
3305   if (INTEGRAL_TYPE_P (type))
3306     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3307
3308   if (aarch64_return_in_msb (type))
3309     {
3310       HOST_WIDE_INT size = int_size_in_bytes (type);
3311
3312       if (size % UNITS_PER_WORD != 0)
3313         {
3314           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3315           mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3316         }
3317     }
3318
3319   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3320                                                &ag_mode, &count, NULL))
3321     {
3322       if (!aarch64_composite_type_p (type, mode))
3323         {
3324           gcc_assert (count == 1 && mode == ag_mode);
3325           return gen_rtx_REG (mode, V0_REGNUM);
3326         }
3327       else
3328         {
3329           int i;
3330           rtx par;
3331
3332           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3333           for (i = 0; i < count; i++)
3334             {
3335               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3336               rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3337               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3338               XVECEXP (par, 0, i) = tmp;
3339             }
3340           return par;
3341         }
3342     }
3343   else
3344     return gen_rtx_REG (mode, R0_REGNUM);
3345 }
3346
3347 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3348    Return true if REGNO is the number of a hard register in which the values
3349    of called function may come back.  */
3350
3351 static bool
3352 aarch64_function_value_regno_p (const unsigned int regno)
3353 {
3354   /* Maximum of 16 bytes can be returned in the general registers.  Examples
3355      of 16-byte return values are: 128-bit integers and 16-byte small
3356      structures (excluding homogeneous floating-point aggregates).  */
3357   if (regno == R0_REGNUM || regno == R1_REGNUM)
3358     return true;
3359
3360   /* Up to four fp/simd registers can return a function value, e.g. a
3361      homogeneous floating-point aggregate having four members.  */
3362   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3363     return TARGET_FLOAT;
3364
3365   return false;
3366 }
3367
3368 /* Implement TARGET_RETURN_IN_MEMORY.
3369
3370    If the type T of the result of a function is such that
3371      void func (T arg)
3372    would require that arg be passed as a value in a register (or set of
3373    registers) according to the parameter passing rules, then the result
3374    is returned in the same registers as would be used for such an
3375    argument.  */
3376
3377 static bool
3378 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3379 {
3380   HOST_WIDE_INT size;
3381   machine_mode ag_mode;
3382   int count;
3383
3384   if (!AGGREGATE_TYPE_P (type)
3385       && TREE_CODE (type) != COMPLEX_TYPE
3386       && TREE_CODE (type) != VECTOR_TYPE)
3387     /* Simple scalar types always returned in registers.  */
3388     return false;
3389
3390   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3391                                                type,
3392                                                &ag_mode,
3393                                                &count,
3394                                                NULL))
3395     return false;
3396
3397   /* Types larger than 2 registers returned in memory.  */
3398   size = int_size_in_bytes (type);
3399   return (size < 0 || size > 2 * UNITS_PER_WORD);
3400 }
3401
3402 static bool
3403 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3404                                const_tree type, int *nregs)
3405 {
3406   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3407   return aarch64_vfp_is_call_or_return_candidate (mode,
3408                                                   type,
3409                                                   &pcum->aapcs_vfp_rmode,
3410                                                   nregs,
3411                                                   NULL);
3412 }
3413
3414 /* Given MODE and TYPE of a function argument, return the alignment in
3415    bits.  The idea is to suppress any stronger alignment requested by
3416    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3417    This is a helper function for local use only.  */
3418
3419 static unsigned int
3420 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3421 {
3422   if (!type)
3423     return GET_MODE_ALIGNMENT (mode);
3424
3425   if (integer_zerop (TYPE_SIZE (type)))
3426     return 0;
3427
3428   gcc_assert (TYPE_MODE (type) == mode);
3429
3430   if (!AGGREGATE_TYPE_P (type))
3431     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3432
3433   if (TREE_CODE (type) == ARRAY_TYPE)
3434     return TYPE_ALIGN (TREE_TYPE (type));
3435
3436   unsigned int alignment = 0;
3437   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3438     if (TREE_CODE (field) == FIELD_DECL)
3439       alignment = std::max (alignment, DECL_ALIGN (field));
3440
3441   return alignment;
3442 }
3443
3444 /* Layout a function argument according to the AAPCS64 rules.  The rule
3445    numbers refer to the rule numbers in the AAPCS64.  */
3446
3447 static void
3448 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3449                     const_tree type,
3450                     bool named ATTRIBUTE_UNUSED)
3451 {
3452   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3453   int ncrn, nvrn, nregs;
3454   bool allocate_ncrn, allocate_nvrn;
3455   HOST_WIDE_INT size;
3456
3457   /* We need to do this once per argument.  */
3458   if (pcum->aapcs_arg_processed)
3459     return;
3460
3461   pcum->aapcs_arg_processed = true;
3462
3463   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
3464   if (type)
3465     size = int_size_in_bytes (type);
3466   else
3467     /* No frontends can create types with variable-sized modes, so we
3468        shouldn't be asked to pass or return them.  */
3469     size = GET_MODE_SIZE (mode).to_constant ();
3470   size = ROUND_UP (size, UNITS_PER_WORD);
3471
3472   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3473   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3474                                                  mode,
3475                                                  type,
3476                                                  &nregs);
3477
3478   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3479      The following code thus handles passing by SIMD/FP registers first.  */
3480
3481   nvrn = pcum->aapcs_nvrn;
3482
3483   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3484      and homogenous short-vector aggregates (HVA).  */
3485   if (allocate_nvrn)
3486     {
3487       if (!TARGET_FLOAT)
3488         aarch64_err_no_fpadvsimd (mode, "argument");
3489
3490       if (nvrn + nregs <= NUM_FP_ARG_REGS)
3491         {
3492           pcum->aapcs_nextnvrn = nvrn + nregs;
3493           if (!aarch64_composite_type_p (type, mode))
3494             {
3495               gcc_assert (nregs == 1);
3496               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3497             }
3498           else
3499             {
3500               rtx par;
3501               int i;
3502               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3503               for (i = 0; i < nregs; i++)
3504                 {
3505                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3506                                          V0_REGNUM + nvrn + i);
3507                   rtx offset = gen_int_mode
3508                     (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3509                   tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3510                   XVECEXP (par, 0, i) = tmp;
3511                 }
3512               pcum->aapcs_reg = par;
3513             }
3514           return;
3515         }
3516       else
3517         {
3518           /* C.3 NSRN is set to 8.  */
3519           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3520           goto on_stack;
3521         }
3522     }
3523
3524   ncrn = pcum->aapcs_ncrn;
3525   nregs = size / UNITS_PER_WORD;
3526
3527   /* C6 - C9.  though the sign and zero extension semantics are
3528      handled elsewhere.  This is the case where the argument fits
3529      entirely general registers.  */
3530   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3531     {
3532
3533       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3534
3535       /* C.8 if the argument has an alignment of 16 then the NGRN is
3536          rounded up to the next even number.  */
3537       if (nregs == 2
3538           && ncrn % 2
3539           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3540              comparison is there because for > 16 * BITS_PER_UNIT
3541              alignment nregs should be > 2 and therefore it should be
3542              passed by reference rather than value.  */
3543           && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3544         {
3545           ++ncrn;
3546           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3547         }
3548
3549       /* NREGS can be 0 when e.g. an empty structure is to be passed.
3550          A reg is still generated for it, but the caller should be smart
3551          enough not to use it.  */
3552       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3553         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3554       else
3555         {
3556           rtx par;
3557           int i;
3558
3559           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3560           for (i = 0; i < nregs; i++)
3561             {
3562               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3563               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3564                                        GEN_INT (i * UNITS_PER_WORD));
3565               XVECEXP (par, 0, i) = tmp;
3566             }
3567           pcum->aapcs_reg = par;
3568         }
3569
3570       pcum->aapcs_nextncrn = ncrn + nregs;
3571       return;
3572     }
3573
3574   /* C.11  */
3575   pcum->aapcs_nextncrn = NUM_ARG_REGS;
3576
3577   /* The argument is passed on stack; record the needed number of words for
3578      this argument and align the total size if necessary.  */
3579 on_stack:
3580   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3581
3582   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3583     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3584                                        16 / UNITS_PER_WORD);
3585   return;
3586 }
3587
3588 /* Implement TARGET_FUNCTION_ARG.  */
3589
3590 static rtx
3591 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3592                       const_tree type, bool named)
3593 {
3594   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3595   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3596
3597   if (mode == VOIDmode)
3598     return NULL_RTX;
3599
3600   aarch64_layout_arg (pcum_v, mode, type, named);
3601   return pcum->aapcs_reg;
3602 }
3603
3604 void
3605 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3606                            const_tree fntype ATTRIBUTE_UNUSED,
3607                            rtx libname ATTRIBUTE_UNUSED,
3608                            const_tree fndecl ATTRIBUTE_UNUSED,
3609                            unsigned n_named ATTRIBUTE_UNUSED)
3610 {
3611   pcum->aapcs_ncrn = 0;
3612   pcum->aapcs_nvrn = 0;
3613   pcum->aapcs_nextncrn = 0;
3614   pcum->aapcs_nextnvrn = 0;
3615   pcum->pcs_variant = ARM_PCS_AAPCS64;
3616   pcum->aapcs_reg = NULL_RTX;
3617   pcum->aapcs_arg_processed = false;
3618   pcum->aapcs_stack_words = 0;
3619   pcum->aapcs_stack_size = 0;
3620
3621   if (!TARGET_FLOAT
3622       && fndecl && TREE_PUBLIC (fndecl)
3623       && fntype && fntype != error_mark_node)
3624     {
3625       const_tree type = TREE_TYPE (fntype);
3626       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
3627       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
3628       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3629                                                    &mode, &nregs, NULL))
3630         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3631     }
3632   return;
3633 }
3634
3635 static void
3636 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3637                               machine_mode mode,
3638                               const_tree type,
3639                               bool named)
3640 {
3641   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3642   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3643     {
3644       aarch64_layout_arg (pcum_v, mode, type, named);
3645       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3646                   != (pcum->aapcs_stack_words != 0));
3647       pcum->aapcs_arg_processed = false;
3648       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3649       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3650       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3651       pcum->aapcs_stack_words = 0;
3652       pcum->aapcs_reg = NULL_RTX;
3653     }
3654 }
3655
3656 bool
3657 aarch64_function_arg_regno_p (unsigned regno)
3658 {
3659   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3660           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3661 }
3662
3663 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
3664    PARM_BOUNDARY bits of alignment, but will be given anything up
3665    to STACK_BOUNDARY bits if the type requires it.  This makes sure
3666    that both before and after the layout of each argument, the Next
3667    Stacked Argument Address (NSAA) will have a minimum alignment of
3668    8 bytes.  */
3669
3670 static unsigned int
3671 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3672 {
3673   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3674   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3675 }
3676
3677 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
3678
3679 static fixed_size_mode
3680 aarch64_get_reg_raw_mode (int regno)
3681 {
3682   if (TARGET_SVE && FP_REGNUM_P (regno))
3683     /* Don't use the SVE part of the register for __builtin_apply and
3684        __builtin_return.  The SVE registers aren't used by the normal PCS,
3685        so using them there would be a waste of time.  The PCS extensions
3686        for SVE types are fundamentally incompatible with the
3687        __builtin_return/__builtin_apply interface.  */
3688     return as_a <fixed_size_mode> (V16QImode);
3689   return default_get_reg_raw_mode (regno);
3690 }
3691
3692 /* Implement TARGET_FUNCTION_ARG_PADDING.
3693
3694    Small aggregate types are placed in the lowest memory address.
3695
3696    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
3697
3698 static pad_direction
3699 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3700 {
3701   /* On little-endian targets, the least significant byte of every stack
3702      argument is passed at the lowest byte address of the stack slot.  */
3703   if (!BYTES_BIG_ENDIAN)
3704     return PAD_UPWARD;
3705
3706   /* Otherwise, integral, floating-point and pointer types are padded downward:
3707      the least significant byte of a stack argument is passed at the highest
3708      byte address of the stack slot.  */
3709   if (type
3710       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3711          || POINTER_TYPE_P (type))
3712       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3713     return PAD_DOWNWARD;
3714
3715   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
3716   return PAD_UPWARD;
3717 }
3718
3719 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3720
3721    It specifies padding for the last (may also be the only)
3722    element of a block move between registers and memory.  If
3723    assuming the block is in the memory, padding upward means that
3724    the last element is padded after its highest significant byte,
3725    while in downward padding, the last element is padded at the
3726    its least significant byte side.
3727
3728    Small aggregates and small complex types are always padded
3729    upwards.
3730
3731    We don't need to worry about homogeneous floating-point or
3732    short-vector aggregates; their move is not affected by the
3733    padding direction determined here.  Regardless of endianness,
3734    each element of such an aggregate is put in the least
3735    significant bits of a fp/simd register.
3736
3737    Return !BYTES_BIG_ENDIAN if the least significant byte of the
3738    register has useful data, and return the opposite if the most
3739    significant byte does.  */
3740
3741 bool
3742 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3743                      bool first ATTRIBUTE_UNUSED)
3744 {
3745
3746   /* Small composite types are always padded upward.  */
3747   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3748     {
3749       HOST_WIDE_INT size;
3750       if (type)
3751         size = int_size_in_bytes (type);
3752       else
3753         /* No frontends can create types with variable-sized modes, so we
3754            shouldn't be asked to pass or return them.  */
3755         size = GET_MODE_SIZE (mode).to_constant ();
3756       if (size < 2 * UNITS_PER_WORD)
3757         return true;
3758     }
3759
3760   /* Otherwise, use the default padding.  */
3761   return !BYTES_BIG_ENDIAN;
3762 }
3763
3764 static scalar_int_mode
3765 aarch64_libgcc_cmp_return_mode (void)
3766 {
3767   return SImode;
3768 }
3769
3770 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3771
3772 /* We use the 12-bit shifted immediate arithmetic instructions so values
3773    must be multiple of (1 << 12), i.e. 4096.  */
3774 #define ARITH_FACTOR 4096
3775
3776 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3777 #error Cannot use simple address calculation for stack probing
3778 #endif
3779
3780 /* The pair of scratch registers used for stack probing.  */
3781 #define PROBE_STACK_FIRST_REG  9
3782 #define PROBE_STACK_SECOND_REG 10
3783
3784 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3785    inclusive.  These are offsets from the current stack pointer.  */
3786
3787 static void
3788 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3789 {
3790   HOST_WIDE_INT size;
3791   if (!poly_size.is_constant (&size))
3792     {
3793       sorry ("stack probes for SVE frames");
3794       return;
3795     }
3796
3797   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3798
3799   /* See the same assertion on PROBE_INTERVAL above.  */
3800   gcc_assert ((first % ARITH_FACTOR) == 0);
3801
3802   /* See if we have a constant small number of probes to generate.  If so,
3803      that's the easy case.  */
3804   if (size <= PROBE_INTERVAL)
3805     {
3806       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3807
3808       emit_set_insn (reg1,
3809                      plus_constant (Pmode,
3810                                     stack_pointer_rtx, -(first + base)));
3811       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3812     }
3813
3814   /* The run-time loop is made up of 8 insns in the generic case while the
3815      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
3816   else if (size <= 4 * PROBE_INTERVAL)
3817     {
3818       HOST_WIDE_INT i, rem;
3819
3820       emit_set_insn (reg1,
3821                      plus_constant (Pmode,
3822                                     stack_pointer_rtx,
3823                                     -(first + PROBE_INTERVAL)));
3824       emit_stack_probe (reg1);
3825
3826       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3827          it exceeds SIZE.  If only two probes are needed, this will not
3828          generate any code.  Then probe at FIRST + SIZE.  */
3829       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3830         {
3831           emit_set_insn (reg1,
3832                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3833           emit_stack_probe (reg1);
3834         }
3835
3836       rem = size - (i - PROBE_INTERVAL);
3837       if (rem > 256)
3838         {
3839           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3840
3841           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3842           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3843         }
3844       else
3845         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3846     }
3847
3848   /* Otherwise, do the same as above, but in a loop.  Note that we must be
3849      extra careful with variables wrapping around because we might be at
3850      the very top (or the very bottom) of the address space and we have
3851      to be able to handle this case properly; in particular, we use an
3852      equality test for the loop condition.  */
3853   else
3854     {
3855       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3856
3857       /* Step 1: round SIZE to the previous multiple of the interval.  */
3858
3859       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3860
3861
3862       /* Step 2: compute initial and final value of the loop counter.  */
3863
3864       /* TEST_ADDR = SP + FIRST.  */
3865       emit_set_insn (reg1,
3866                      plus_constant (Pmode, stack_pointer_rtx, -first));
3867
3868       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
3869       HOST_WIDE_INT adjustment = - (first + rounded_size);
3870       if (! aarch64_uimm12_shift (adjustment))
3871         {
3872           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3873                                           true, Pmode);
3874           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3875         }
3876       else
3877         emit_set_insn (reg2,
3878                        plus_constant (Pmode, stack_pointer_rtx, adjustment));
3879
3880       /* Step 3: the loop
3881
3882          do
3883            {
3884              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3885              probe at TEST_ADDR
3886            }
3887          while (TEST_ADDR != LAST_ADDR)
3888
3889          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3890          until it is equal to ROUNDED_SIZE.  */
3891
3892       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3893
3894
3895       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3896          that SIZE is equal to ROUNDED_SIZE.  */
3897
3898       if (size != rounded_size)
3899         {
3900           HOST_WIDE_INT rem = size - rounded_size;
3901
3902           if (rem > 256)
3903             {
3904               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3905
3906               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3907               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3908             }
3909           else
3910             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3911         }
3912     }
3913
3914   /* Make sure nothing is scheduled before we are done.  */
3915   emit_insn (gen_blockage ());
3916 }
3917
3918 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
3919    absolute addresses.  */
3920
3921 const char *
3922 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3923 {
3924   static int labelno = 0;
3925   char loop_lab[32];
3926   rtx xops[2];
3927
3928   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3929
3930   /* Loop.  */
3931   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3932
3933   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
3934   xops[0] = reg1;
3935   xops[1] = GEN_INT (PROBE_INTERVAL);
3936   output_asm_insn ("sub\t%0, %0, %1", xops);
3937
3938   /* Probe at TEST_ADDR.  */
3939   output_asm_insn ("str\txzr, [%0]", xops);
3940
3941   /* Test if TEST_ADDR == LAST_ADDR.  */
3942   xops[1] = reg2;
3943   output_asm_insn ("cmp\t%0, %1", xops);
3944
3945   /* Branch.  */
3946   fputs ("\tb.ne\t", asm_out_file);
3947   assemble_name_raw (asm_out_file, loop_lab);
3948   fputc ('\n', asm_out_file);
3949
3950   return "";
3951 }
3952
3953 /* Mark the registers that need to be saved by the callee and calculate
3954    the size of the callee-saved registers area and frame record (both FP
3955    and LR may be omitted).  */
3956 static void
3957 aarch64_layout_frame (void)
3958 {
3959   HOST_WIDE_INT offset = 0;
3960   int regno, last_fp_reg = INVALID_REGNUM;
3961
3962   if (reload_completed && cfun->machine->frame.laid_out)
3963     return;
3964
3965   /* Force a frame chain for EH returns so the return address is at FP+8.  */
3966   cfun->machine->frame.emit_frame_chain
3967     = frame_pointer_needed || crtl->calls_eh_return;
3968
3969   /* Emit a frame chain if the frame pointer is enabled.
3970      If -momit-leaf-frame-pointer is used, do not use a frame chain
3971      in leaf functions which do not use LR.  */
3972   if (flag_omit_frame_pointer == 2
3973       && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3974            && !df_regs_ever_live_p (LR_REGNUM)))
3975     cfun->machine->frame.emit_frame_chain = true;
3976
3977 #define SLOT_NOT_REQUIRED (-2)
3978 #define SLOT_REQUIRED     (-1)
3979
3980   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3981   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3982
3983   /* First mark all the registers that really need to be saved...  */
3984   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3985     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3986
3987   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3988     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3989
3990   /* ... that includes the eh data registers (if needed)...  */
3991   if (crtl->calls_eh_return)
3992     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3993       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3994         = SLOT_REQUIRED;
3995
3996   /* ... and any callee saved register that dataflow says is live.  */
3997   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3998     if (df_regs_ever_live_p (regno)
3999         && (regno == R30_REGNUM
4000             || !call_used_regs[regno]))
4001       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4002
4003   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4004     if (df_regs_ever_live_p (regno)
4005         && !call_used_regs[regno])
4006       {
4007         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4008         last_fp_reg = regno;
4009       }
4010
4011   if (cfun->machine->frame.emit_frame_chain)
4012     {
4013       /* FP and LR are placed in the linkage record.  */
4014       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4015       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4016       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4017       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4018       offset = 2 * UNITS_PER_WORD;
4019     }
4020
4021   /* Now assign stack slots for them.  */
4022   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4023     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4024       {
4025         cfun->machine->frame.reg_offset[regno] = offset;
4026         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4027           cfun->machine->frame.wb_candidate1 = regno;
4028         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4029           cfun->machine->frame.wb_candidate2 = regno;
4030         offset += UNITS_PER_WORD;
4031       }
4032
4033   HOST_WIDE_INT max_int_offset = offset;
4034   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4035   bool has_align_gap = offset != max_int_offset;
4036
4037   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4038     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4039       {
4040         /* If there is an alignment gap between integer and fp callee-saves,
4041            allocate the last fp register to it if possible.  */
4042         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4043           {
4044             cfun->machine->frame.reg_offset[regno] = max_int_offset;
4045             break;
4046           }
4047
4048         cfun->machine->frame.reg_offset[regno] = offset;
4049         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4050           cfun->machine->frame.wb_candidate1 = regno;
4051         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4052                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4053           cfun->machine->frame.wb_candidate2 = regno;
4054         offset += UNITS_PER_WORD;
4055       }
4056
4057   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4058
4059   cfun->machine->frame.saved_regs_size = offset;
4060
4061   HOST_WIDE_INT varargs_and_saved_regs_size
4062     = offset + cfun->machine->frame.saved_varargs_size;
4063
4064   cfun->machine->frame.hard_fp_offset
4065     = aligned_upper_bound (varargs_and_saved_regs_size
4066                            + get_frame_size (),
4067                            STACK_BOUNDARY / BITS_PER_UNIT);
4068
4069   /* Both these values are already aligned.  */
4070   gcc_assert (multiple_p (crtl->outgoing_args_size,
4071                           STACK_BOUNDARY / BITS_PER_UNIT));
4072   cfun->machine->frame.frame_size
4073     = (cfun->machine->frame.hard_fp_offset
4074        + crtl->outgoing_args_size);
4075
4076   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4077
4078   cfun->machine->frame.initial_adjust = 0;
4079   cfun->machine->frame.final_adjust = 0;
4080   cfun->machine->frame.callee_adjust = 0;
4081   cfun->machine->frame.callee_offset = 0;
4082
4083   HOST_WIDE_INT max_push_offset = 0;
4084   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4085     max_push_offset = 512;
4086   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4087     max_push_offset = 256;
4088
4089   HOST_WIDE_INT const_size, const_fp_offset;
4090   if (cfun->machine->frame.frame_size.is_constant (&const_size)
4091       && const_size < max_push_offset
4092       && known_eq (crtl->outgoing_args_size, 0))
4093     {
4094       /* Simple, small frame with no outgoing arguments:
4095          stp reg1, reg2, [sp, -frame_size]!
4096          stp reg3, reg4, [sp, 16]  */
4097       cfun->machine->frame.callee_adjust = const_size;
4098     }
4099   else if (known_lt (crtl->outgoing_args_size
4100                      + cfun->machine->frame.saved_regs_size, 512)
4101            && !(cfun->calls_alloca
4102                 && known_lt (cfun->machine->frame.hard_fp_offset,
4103                              max_push_offset)))
4104     {
4105       /* Frame with small outgoing arguments:
4106          sub sp, sp, frame_size
4107          stp reg1, reg2, [sp, outgoing_args_size]
4108          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
4109       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4110       cfun->machine->frame.callee_offset
4111         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4112     }
4113   else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4114            && const_fp_offset < max_push_offset)
4115     {
4116       /* Frame with large outgoing arguments but a small local area:
4117          stp reg1, reg2, [sp, -hard_fp_offset]!
4118          stp reg3, reg4, [sp, 16]
4119          sub sp, sp, outgoing_args_size  */
4120       cfun->machine->frame.callee_adjust = const_fp_offset;
4121       cfun->machine->frame.final_adjust
4122         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4123     }
4124   else
4125     {
4126       /* Frame with large local area and outgoing arguments using frame pointer:
4127          sub sp, sp, hard_fp_offset
4128          stp x29, x30, [sp, 0]
4129          add x29, sp, 0
4130          stp reg3, reg4, [sp, 16]
4131          sub sp, sp, outgoing_args_size  */
4132       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4133       cfun->machine->frame.final_adjust
4134         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4135     }
4136
4137   cfun->machine->frame.laid_out = true;
4138 }
4139
4140 /* Return true if the register REGNO is saved on entry to
4141    the current function.  */
4142
4143 static bool
4144 aarch64_register_saved_on_entry (int regno)
4145 {
4146   return cfun->machine->frame.reg_offset[regno] >= 0;
4147 }
4148
4149 /* Return the next register up from REGNO up to LIMIT for the callee
4150    to save.  */
4151
4152 static unsigned
4153 aarch64_next_callee_save (unsigned regno, unsigned limit)
4154 {
4155   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4156     regno ++;
4157   return regno;
4158 }
4159
4160 /* Push the register number REGNO of mode MODE to the stack with write-back
4161    adjusting the stack by ADJUSTMENT.  */
4162
4163 static void
4164 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4165                            HOST_WIDE_INT adjustment)
4166  {
4167   rtx base_rtx = stack_pointer_rtx;
4168   rtx insn, reg, mem;
4169
4170   reg = gen_rtx_REG (mode, regno);
4171   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4172                             plus_constant (Pmode, base_rtx, -adjustment));
4173   mem = gen_frame_mem (mode, mem);
4174
4175   insn = emit_move_insn (mem, reg);
4176   RTX_FRAME_RELATED_P (insn) = 1;
4177 }
4178
4179 /* Generate and return an instruction to store the pair of registers
4180    REG and REG2 of mode MODE to location BASE with write-back adjusting
4181    the stack location BASE by ADJUSTMENT.  */
4182
4183 static rtx
4184 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4185                           HOST_WIDE_INT adjustment)
4186 {
4187   switch (mode)
4188     {
4189     case E_DImode:
4190       return gen_storewb_pairdi_di (base, base, reg, reg2,
4191                                     GEN_INT (-adjustment),
4192                                     GEN_INT (UNITS_PER_WORD - adjustment));
4193     case E_DFmode:
4194       return gen_storewb_pairdf_di (base, base, reg, reg2,
4195                                     GEN_INT (-adjustment),
4196                                     GEN_INT (UNITS_PER_WORD - adjustment));
4197     default:
4198       gcc_unreachable ();
4199     }
4200 }
4201
4202 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4203    stack pointer by ADJUSTMENT.  */
4204
4205 static void
4206 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4207 {
4208   rtx_insn *insn;
4209   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4210
4211   if (regno2 == INVALID_REGNUM)
4212     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4213
4214   rtx reg1 = gen_rtx_REG (mode, regno1);
4215   rtx reg2 = gen_rtx_REG (mode, regno2);
4216
4217   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4218                                               reg2, adjustment));
4219   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4220   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4221   RTX_FRAME_RELATED_P (insn) = 1;
4222 }
4223
4224 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4225    adjusting it by ADJUSTMENT afterwards.  */
4226
4227 static rtx
4228 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4229                          HOST_WIDE_INT adjustment)
4230 {
4231   switch (mode)
4232     {
4233     case E_DImode:
4234       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4235                                    GEN_INT (UNITS_PER_WORD));
4236     case E_DFmode:
4237       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4238                                    GEN_INT (UNITS_PER_WORD));
4239     default:
4240       gcc_unreachable ();
4241     }
4242 }
4243
4244 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4245    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4246    into CFI_OPS.  */
4247
4248 static void
4249 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4250                   rtx *cfi_ops)
4251 {
4252   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4253   rtx reg1 = gen_rtx_REG (mode, regno1);
4254
4255   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4256
4257   if (regno2 == INVALID_REGNUM)
4258     {
4259       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4260       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4261       emit_move_insn (reg1, gen_frame_mem (mode, mem));
4262     }
4263   else
4264     {
4265       rtx reg2 = gen_rtx_REG (mode, regno2);
4266       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4267       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4268                                           reg2, adjustment));
4269     }
4270 }
4271
4272 /* Generate and return a store pair instruction of mode MODE to store
4273    register REG1 to MEM1 and register REG2 to MEM2.  */
4274
4275 static rtx
4276 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4277                         rtx reg2)
4278 {
4279   switch (mode)
4280     {
4281     case E_DImode:
4282       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4283
4284     case E_DFmode:
4285       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4286
4287     default:
4288       gcc_unreachable ();
4289     }
4290 }
4291
4292 /* Generate and regurn a load pair isntruction of mode MODE to load register
4293    REG1 from MEM1 and register REG2 from MEM2.  */
4294
4295 static rtx
4296 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4297                        rtx mem2)
4298 {
4299   switch (mode)
4300     {
4301     case E_DImode:
4302       return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4303
4304     case E_DFmode:
4305       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4306
4307     default:
4308       gcc_unreachable ();
4309     }
4310 }
4311
4312 /* Return TRUE if return address signing should be enabled for the current
4313    function, otherwise return FALSE.  */
4314
4315 bool
4316 aarch64_return_address_signing_enabled (void)
4317 {
4318   /* This function should only be called after frame laid out.   */
4319   gcc_assert (cfun->machine->frame.laid_out);
4320
4321   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4322      if it's LR is pushed onto stack.  */
4323   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4324           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4325               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4326 }
4327
4328 /* Emit code to save the callee-saved registers from register number START
4329    to LIMIT to the stack at the location starting at offset START_OFFSET,
4330    skipping any write-back candidates if SKIP_WB is true.  */
4331
4332 static void
4333 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4334                            unsigned start, unsigned limit, bool skip_wb)
4335 {
4336   rtx_insn *insn;
4337   unsigned regno;
4338   unsigned regno2;
4339
4340   for (regno = aarch64_next_callee_save (start, limit);
4341        regno <= limit;
4342        regno = aarch64_next_callee_save (regno + 1, limit))
4343     {
4344       rtx reg, mem;
4345       poly_int64 offset;
4346
4347       if (skip_wb
4348           && (regno == cfun->machine->frame.wb_candidate1
4349               || regno == cfun->machine->frame.wb_candidate2))
4350         continue;
4351
4352       if (cfun->machine->reg_is_wrapped_separately[regno])
4353        continue;
4354
4355       reg = gen_rtx_REG (mode, regno);
4356       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4357       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4358                                                 offset));
4359
4360       regno2 = aarch64_next_callee_save (regno + 1, limit);
4361
4362       if (regno2 <= limit
4363           && !cfun->machine->reg_is_wrapped_separately[regno2]
4364           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4365               == cfun->machine->frame.reg_offset[regno2]))
4366
4367         {
4368           rtx reg2 = gen_rtx_REG (mode, regno2);
4369           rtx mem2;
4370
4371           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4372           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4373                                                      offset));
4374           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4375                                                     reg2));
4376
4377           /* The first part of a frame-related parallel insn is
4378              always assumed to be relevant to the frame
4379              calculations; subsequent parts, are only
4380              frame-related if explicitly marked.  */
4381           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4382           regno = regno2;
4383         }
4384       else
4385         insn = emit_move_insn (mem, reg);
4386
4387       RTX_FRAME_RELATED_P (insn) = 1;
4388     }
4389 }
4390
4391 /* Emit code to restore the callee registers of mode MODE from register
4392    number START up to and including LIMIT.  Restore from the stack offset
4393    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4394    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
4395
4396 static void
4397 aarch64_restore_callee_saves (machine_mode mode,
4398                               poly_int64 start_offset, unsigned start,
4399                               unsigned limit, bool skip_wb, rtx *cfi_ops)
4400 {
4401   rtx base_rtx = stack_pointer_rtx;
4402   unsigned regno;
4403   unsigned regno2;
4404   poly_int64 offset;
4405
4406   for (regno = aarch64_next_callee_save (start, limit);
4407        regno <= limit;
4408        regno = aarch64_next_callee_save (regno + 1, limit))
4409     {
4410       if (cfun->machine->reg_is_wrapped_separately[regno])
4411        continue;
4412
4413       rtx reg, mem;
4414
4415       if (skip_wb
4416           && (regno == cfun->machine->frame.wb_candidate1
4417               || regno == cfun->machine->frame.wb_candidate2))
4418         continue;
4419
4420       reg = gen_rtx_REG (mode, regno);
4421       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4422       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4423
4424       regno2 = aarch64_next_callee_save (regno + 1, limit);
4425
4426       if (regno2 <= limit
4427           && !cfun->machine->reg_is_wrapped_separately[regno2]
4428           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4429               == cfun->machine->frame.reg_offset[regno2]))
4430         {
4431           rtx reg2 = gen_rtx_REG (mode, regno2);
4432           rtx mem2;
4433
4434           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4435           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4436           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4437
4438           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4439           regno = regno2;
4440         }
4441       else
4442         emit_move_insn (reg, mem);
4443       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4444     }
4445 }
4446
4447 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4448    of MODE.  */
4449
4450 static inline bool
4451 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4452 {
4453   HOST_WIDE_INT multiple;
4454   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4455           && IN_RANGE (multiple, -8, 7));
4456 }
4457
4458 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
4459    of MODE.  */
4460
4461 static inline bool
4462 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4463 {
4464   HOST_WIDE_INT multiple;
4465   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4466           && IN_RANGE (multiple, 0, 63));
4467 }
4468
4469 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4470    of MODE.  */
4471
4472 bool
4473 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4474 {
4475   HOST_WIDE_INT multiple;
4476   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4477           && IN_RANGE (multiple, -64, 63));
4478 }
4479
4480 /* Return true if OFFSET is a signed 9-bit value.  */
4481
4482 static inline bool
4483 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4484                                poly_int64 offset)
4485 {
4486   HOST_WIDE_INT const_offset;
4487   return (offset.is_constant (&const_offset)
4488           && IN_RANGE (const_offset, -256, 255));
4489 }
4490
4491 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4492    of MODE.  */
4493
4494 static inline bool
4495 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4496 {
4497   HOST_WIDE_INT multiple;
4498   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4499           && IN_RANGE (multiple, -256, 255));
4500 }
4501
4502 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4503    of MODE.  */
4504
4505 static inline bool
4506 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4507 {
4508   HOST_WIDE_INT multiple;
4509   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4510           && IN_RANGE (multiple, 0, 4095));
4511 }
4512
4513 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
4514
4515 static sbitmap
4516 aarch64_get_separate_components (void)
4517 {
4518   aarch64_layout_frame ();
4519
4520   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4521   bitmap_clear (components);
4522
4523   /* The registers we need saved to the frame.  */
4524   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4525     if (aarch64_register_saved_on_entry (regno))
4526       {
4527         poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4528         if (!frame_pointer_needed)
4529           offset += cfun->machine->frame.frame_size
4530                     - cfun->machine->frame.hard_fp_offset;
4531         /* Check that we can access the stack slot of the register with one
4532            direct load with no adjustments needed.  */
4533         if (offset_12bit_unsigned_scaled_p (DImode, offset))
4534           bitmap_set_bit (components, regno);
4535       }
4536
4537   /* Don't mess with the hard frame pointer.  */
4538   if (frame_pointer_needed)
4539     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4540
4541   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4542   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4543   /* If aarch64_layout_frame has chosen registers to store/restore with
4544      writeback don't interfere with them to avoid having to output explicit
4545      stack adjustment instructions.  */
4546   if (reg2 != INVALID_REGNUM)
4547     bitmap_clear_bit (components, reg2);
4548   if (reg1 != INVALID_REGNUM)
4549     bitmap_clear_bit (components, reg1);
4550
4551   bitmap_clear_bit (components, LR_REGNUM);
4552   bitmap_clear_bit (components, SP_REGNUM);
4553
4554   return components;
4555 }
4556
4557 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
4558
4559 static sbitmap
4560 aarch64_components_for_bb (basic_block bb)
4561 {
4562   bitmap in = DF_LIVE_IN (bb);
4563   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4564   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4565
4566   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4567   bitmap_clear (components);
4568
4569   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
4570   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4571     if ((!call_used_regs[regno])
4572        && (bitmap_bit_p (in, regno)
4573            || bitmap_bit_p (gen, regno)
4574            || bitmap_bit_p (kill, regno)))
4575       {
4576         unsigned regno2, offset, offset2;
4577         bitmap_set_bit (components, regno);
4578
4579         /* If there is a callee-save at an adjacent offset, add it too
4580            to increase the use of LDP/STP.  */
4581         offset = cfun->machine->frame.reg_offset[regno];
4582         regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4583
4584         if (regno2 <= LAST_SAVED_REGNUM)
4585           {
4586             offset2 = cfun->machine->frame.reg_offset[regno2];
4587             if ((offset & ~8) == (offset2 & ~8))
4588               bitmap_set_bit (components, regno2);
4589           }
4590       }
4591
4592   return components;
4593 }
4594
4595 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4596    Nothing to do for aarch64.  */
4597
4598 static void
4599 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4600 {
4601 }
4602
4603 /* Return the next set bit in BMP from START onwards.  Return the total number
4604    of bits in BMP if no set bit is found at or after START.  */
4605
4606 static unsigned int
4607 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4608 {
4609   unsigned int nbits = SBITMAP_SIZE (bmp);
4610   if (start == nbits)
4611     return start;
4612
4613   gcc_assert (start < nbits);
4614   for (unsigned int i = start; i < nbits; i++)
4615     if (bitmap_bit_p (bmp, i))
4616       return i;
4617
4618   return nbits;
4619 }
4620
4621 /* Do the work for aarch64_emit_prologue_components and
4622    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
4623    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4624    for these components or the epilogue sequence.  That is, it determines
4625    whether we should emit stores or loads and what kind of CFA notes to attach
4626    to the insns.  Otherwise the logic for the two sequences is very
4627    similar.  */
4628
4629 static void
4630 aarch64_process_components (sbitmap components, bool prologue_p)
4631 {
4632   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4633                              ? HARD_FRAME_POINTER_REGNUM
4634                              : STACK_POINTER_REGNUM);
4635
4636   unsigned last_regno = SBITMAP_SIZE (components);
4637   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4638   rtx_insn *insn = NULL;
4639
4640   while (regno != last_regno)
4641     {
4642       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4643          so DFmode for the vector registers is enough.  */
4644       machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4645       rtx reg = gen_rtx_REG (mode, regno);
4646       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4647       if (!frame_pointer_needed)
4648         offset += cfun->machine->frame.frame_size
4649                   - cfun->machine->frame.hard_fp_offset;
4650       rtx addr = plus_constant (Pmode, ptr_reg, offset);
4651       rtx mem = gen_frame_mem (mode, addr);
4652
4653       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4654       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4655       /* No more registers to handle after REGNO.
4656          Emit a single save/restore and exit.  */
4657       if (regno2 == last_regno)
4658         {
4659           insn = emit_insn (set);
4660           RTX_FRAME_RELATED_P (insn) = 1;
4661           if (prologue_p)
4662             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4663           else
4664             add_reg_note (insn, REG_CFA_RESTORE, reg);
4665           break;
4666         }
4667
4668       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4669       /* The next register is not of the same class or its offset is not
4670          mergeable with the current one into a pair.  */
4671       if (!satisfies_constraint_Ump (mem)
4672           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4673           || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4674                        GET_MODE_SIZE (mode)))
4675         {
4676           insn = emit_insn (set);
4677           RTX_FRAME_RELATED_P (insn) = 1;
4678           if (prologue_p)
4679             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4680           else
4681             add_reg_note (insn, REG_CFA_RESTORE, reg);
4682
4683           regno = regno2;
4684           continue;
4685         }
4686
4687       /* REGNO2 can be saved/restored in a pair with REGNO.  */
4688       rtx reg2 = gen_rtx_REG (mode, regno2);
4689       if (!frame_pointer_needed)
4690         offset2 += cfun->machine->frame.frame_size
4691                   - cfun->machine->frame.hard_fp_offset;
4692       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4693       rtx mem2 = gen_frame_mem (mode, addr2);
4694       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4695                              : gen_rtx_SET (reg2, mem2);
4696
4697       if (prologue_p)
4698         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4699       else
4700         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4701
4702       RTX_FRAME_RELATED_P (insn) = 1;
4703       if (prologue_p)
4704         {
4705           add_reg_note (insn, REG_CFA_OFFSET, set);
4706           add_reg_note (insn, REG_CFA_OFFSET, set2);
4707         }
4708       else
4709         {
4710           add_reg_note (insn, REG_CFA_RESTORE, reg);
4711           add_reg_note (insn, REG_CFA_RESTORE, reg2);
4712         }
4713
4714       regno = aarch64_get_next_set_bit (components, regno2 + 1);
4715     }
4716 }
4717
4718 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
4719
4720 static void
4721 aarch64_emit_prologue_components (sbitmap components)
4722 {
4723   aarch64_process_components (components, true);
4724 }
4725
4726 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
4727
4728 static void
4729 aarch64_emit_epilogue_components (sbitmap components)
4730 {
4731   aarch64_process_components (components, false);
4732 }
4733
4734 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
4735
4736 static void
4737 aarch64_set_handled_components (sbitmap components)
4738 {
4739   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4740     if (bitmap_bit_p (components, regno))
4741       cfun->machine->reg_is_wrapped_separately[regno] = true;
4742 }
4743
4744 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4745    is saved at BASE + OFFSET.  */
4746
4747 static void
4748 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4749                             rtx base, poly_int64 offset)
4750 {
4751   rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4752   add_reg_note (insn, REG_CFA_EXPRESSION,
4753                 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4754 }
4755
4756 /* AArch64 stack frames generated by this compiler look like:
4757
4758         +-------------------------------+
4759         |                               |
4760         |  incoming stack arguments     |
4761         |                               |
4762         +-------------------------------+
4763         |                               | <-- incoming stack pointer (aligned)
4764         |  callee-allocated save area   |
4765         |  for register varargs         |
4766         |                               |
4767         +-------------------------------+
4768         |  local variables              | <-- frame_pointer_rtx
4769         |                               |
4770         +-------------------------------+
4771         |  padding0                     | \
4772         +-------------------------------+  |
4773         |  callee-saved registers       |  | frame.saved_regs_size
4774         +-------------------------------+  |
4775         |  LR'                          |  |
4776         +-------------------------------+  |
4777         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
4778         +-------------------------------+
4779         |  dynamic allocation           |
4780         +-------------------------------+
4781         |  padding                      |
4782         +-------------------------------+
4783         |  outgoing stack arguments     | <-- arg_pointer
4784         |                               |
4785         +-------------------------------+
4786         |                               | <-- stack_pointer_rtx (aligned)
4787
4788    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4789    but leave frame_pointer_rtx and hard_frame_pointer_rtx
4790    unchanged.  */
4791
4792 /* Generate the prologue instructions for entry into a function.
4793    Establish the stack frame by decreasing the stack pointer with a
4794    properly calculated size and, if necessary, create a frame record
4795    filled with the values of LR and previous frame pointer.  The
4796    current FP is also set up if it is in use.  */
4797
4798 void
4799 aarch64_expand_prologue (void)
4800 {
4801   aarch64_layout_frame ();
4802
4803   poly_int64 frame_size = cfun->machine->frame.frame_size;
4804   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4805   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4806   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4807   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4808   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4809   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4810   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4811   rtx_insn *insn;
4812
4813   /* Sign return address for functions.  */
4814   if (aarch64_return_address_signing_enabled ())
4815     {
4816       insn = emit_insn (gen_pacisp ());
4817       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4818       RTX_FRAME_RELATED_P (insn) = 1;
4819     }
4820
4821   if (flag_stack_usage_info)
4822     current_function_static_stack_size = constant_lower_bound (frame_size);
4823
4824   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4825     {
4826       if (crtl->is_leaf && !cfun->calls_alloca)
4827         {
4828           if (maybe_gt (frame_size, PROBE_INTERVAL)
4829               && maybe_gt (frame_size, get_stack_check_protect ()))
4830             aarch64_emit_probe_stack_range (get_stack_check_protect (),
4831                                             (frame_size
4832                                              - get_stack_check_protect ()));
4833         }
4834       else if (maybe_gt (frame_size, 0))
4835         aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4836     }
4837
4838   rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4839   rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4840
4841   aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4842
4843   if (callee_adjust != 0)
4844     aarch64_push_regs (reg1, reg2, callee_adjust);
4845
4846   if (emit_frame_chain)
4847     {
4848       poly_int64 reg_offset = callee_adjust;
4849       if (callee_adjust == 0)
4850         {
4851           reg1 = R29_REGNUM;
4852           reg2 = R30_REGNUM;
4853           reg_offset = callee_offset;
4854           aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4855         }
4856       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4857                           stack_pointer_rtx, callee_offset,
4858                           ip1_rtx, ip0_rtx, frame_pointer_needed);
4859       if (frame_pointer_needed && !frame_size.is_constant ())
4860         {
4861           /* Variable-sized frames need to describe the save slot
4862              address using DW_CFA_expression rather than DW_CFA_offset.
4863              This means that, without taking further action, the
4864              locations of the registers that we've already saved would
4865              remain based on the stack pointer even after we redefine
4866              the CFA based on the frame pointer.  We therefore need new
4867              DW_CFA_expressions to re-express the save slots with addresses
4868              based on the frame pointer.  */
4869           rtx_insn *insn = get_last_insn ();
4870           gcc_assert (RTX_FRAME_RELATED_P (insn));
4871
4872           /* Add an explicit CFA definition if this was previously
4873              implicit.  */
4874           if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4875             {
4876               rtx src = plus_constant (Pmode, stack_pointer_rtx,
4877                                        callee_offset);
4878               add_reg_note (insn, REG_CFA_ADJUST_CFA,
4879                             gen_rtx_SET (hard_frame_pointer_rtx, src));
4880             }
4881
4882           /* Change the save slot expressions for the registers that
4883              we've already saved.  */
4884           reg_offset -= callee_offset;
4885           aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4886                                       reg_offset + UNITS_PER_WORD);
4887           aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4888                                       reg_offset);
4889         }
4890       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4891     }
4892
4893   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4894                              callee_adjust != 0 || emit_frame_chain);
4895   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4896                              callee_adjust != 0 || emit_frame_chain);
4897   aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4898 }
4899
4900 /* Return TRUE if we can use a simple_return insn.
4901
4902    This function checks whether the callee saved stack is empty, which
4903    means no restore actions are need. The pro_and_epilogue will use
4904    this to check whether shrink-wrapping opt is feasible.  */
4905
4906 bool
4907 aarch64_use_return_insn_p (void)
4908 {
4909   if (!reload_completed)
4910     return false;
4911
4912   if (crtl->profile)
4913     return false;
4914
4915   aarch64_layout_frame ();
4916
4917   return known_eq (cfun->machine->frame.frame_size, 0);
4918 }
4919
4920 /* Generate the epilogue instructions for returning from a function.
4921    This is almost exactly the reverse of the prolog sequence, except
4922    that we need to insert barriers to avoid scheduling loads that read
4923    from a deallocated stack, and we optimize the unwind records by
4924    emitting them all together if possible.  */
4925 void
4926 aarch64_expand_epilogue (bool for_sibcall)
4927 {
4928   aarch64_layout_frame ();
4929
4930   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4931   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4932   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4933   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4934   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4935   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4936   rtx cfi_ops = NULL;
4937   rtx_insn *insn;
4938   /* A stack clash protection prologue may not have left IP0_REGNUM or
4939      IP1_REGNUM in a usable state.  The same is true for allocations
4940      with an SVE component, since we then need both temporary registers
4941      for each allocation.  */
4942   bool can_inherit_p = (initial_adjust.is_constant ()
4943                         && final_adjust.is_constant ()
4944                         && !flag_stack_clash_protection);
4945
4946   /* We need to add memory barrier to prevent read from deallocated stack.  */
4947   bool need_barrier_p
4948     = maybe_ne (get_frame_size ()
4949                 + cfun->machine->frame.saved_varargs_size, 0);
4950
4951   /* Emit a barrier to prevent loads from a deallocated stack.  */
4952   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4953       || cfun->calls_alloca
4954       || crtl->calls_eh_return)
4955     {
4956       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4957       need_barrier_p = false;
4958     }
4959
4960   /* Restore the stack pointer from the frame pointer if it may not
4961      be the same as the stack pointer.  */
4962   rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4963   rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4964   if (frame_pointer_needed
4965       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4966     /* If writeback is used when restoring callee-saves, the CFA
4967        is restored on the instruction doing the writeback.  */
4968     aarch64_add_offset (Pmode, stack_pointer_rtx,
4969                         hard_frame_pointer_rtx, -callee_offset,
4970                         ip1_rtx, ip0_rtx, callee_adjust == 0);
4971   else
4972     aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4973                     !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4974
4975   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4976                                 callee_adjust != 0, &cfi_ops);
4977   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4978                                 callee_adjust != 0, &cfi_ops);
4979
4980   if (need_barrier_p)
4981     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4982
4983   if (callee_adjust != 0)
4984     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4985
4986   if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4987     {
4988       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
4989       insn = get_last_insn ();
4990       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4991       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4992       RTX_FRAME_RELATED_P (insn) = 1;
4993       cfi_ops = NULL;
4994     }
4995
4996   aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4997                   !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4998
4999   if (cfi_ops)
5000     {
5001       /* Emit delayed restores and reset the CFA to be SP.  */
5002       insn = get_last_insn ();
5003       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5004       REG_NOTES (insn) = cfi_ops;
5005       RTX_FRAME_RELATED_P (insn) = 1;
5006     }
5007
5008   /* We prefer to emit the combined return/authenticate instruction RETAA,
5009      however there are three cases in which we must instead emit an explicit
5010      authentication instruction.
5011
5012         1) Sibcalls don't return in a normal way, so if we're about to call one
5013            we must authenticate.
5014
5015         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5016            generating code for !TARGET_ARMV8_3 we can't use it and must
5017            explicitly authenticate.
5018
5019         3) On an eh_return path we make extra stack adjustments to update the
5020            canonical frame address to be the exception handler's CFA.  We want
5021            to authenticate using the CFA of the function which calls eh_return.
5022     */
5023   if (aarch64_return_address_signing_enabled ()
5024       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5025     {
5026       insn = emit_insn (gen_autisp ());
5027       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5028       RTX_FRAME_RELATED_P (insn) = 1;
5029     }
5030
5031   /* Stack adjustment for exception handler.  */
5032   if (crtl->calls_eh_return)
5033     {
5034       /* We need to unwind the stack by the offset computed by
5035          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
5036          to be SP; letting the CFA move during this adjustment
5037          is just as correct as retaining the CFA from the body
5038          of the function.  Therefore, do nothing special.  */
5039       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5040     }
5041
5042   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5043   if (!for_sibcall)
5044     emit_jump_insn (ret_rtx);
5045 }
5046
5047 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
5048    normally or return to a previous frame after unwinding.
5049
5050    An EH return uses a single shared return sequence.  The epilogue is
5051    exactly like a normal epilogue except that it has an extra input
5052    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5053    that must be applied after the frame has been destroyed.  An extra label
5054    is inserted before the epilogue which initializes this register to zero,
5055    and this is the entry point for a normal return.
5056
5057    An actual EH return updates the return address, initializes the stack
5058    adjustment and jumps directly into the epilogue (bypassing the zeroing
5059    of the adjustment).  Since the return address is typically saved on the
5060    stack when a function makes a call, the saved LR must be updated outside
5061    the epilogue.
5062
5063    This poses problems as the store is generated well before the epilogue,
5064    so the offset of LR is not known yet.  Also optimizations will remove the
5065    store as it appears dead, even after the epilogue is generated (as the
5066    base or offset for loading LR is different in many cases).
5067
5068    To avoid these problems this implementation forces the frame pointer
5069    in eh_return functions so that the location of LR is fixed and known early.
5070    It also marks the store volatile, so no optimization is permitted to
5071    remove the store.  */
5072 rtx
5073 aarch64_eh_return_handler_rtx (void)
5074 {
5075   rtx tmp = gen_frame_mem (Pmode,
5076     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5077
5078   /* Mark the store volatile, so no optimization is permitted to remove it.  */
5079   MEM_VOLATILE_P (tmp) = true;
5080   return tmp;
5081 }
5082
5083 /* Output code to add DELTA to the first argument, and then jump
5084    to FUNCTION.  Used for C++ multiple inheritance.  */
5085 static void
5086 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5087                          HOST_WIDE_INT delta,
5088                          HOST_WIDE_INT vcall_offset,
5089                          tree function)
5090 {
5091   /* The this pointer is always in x0.  Note that this differs from
5092      Arm where the this pointer maybe bumped to r1 if r0 is required
5093      to return a pointer to an aggregate.  On AArch64 a result value
5094      pointer will be in x8.  */
5095   int this_regno = R0_REGNUM;
5096   rtx this_rtx, temp0, temp1, addr, funexp;
5097   rtx_insn *insn;
5098
5099   reload_completed = 1;
5100   emit_note (NOTE_INSN_PROLOGUE_END);
5101
5102   this_rtx = gen_rtx_REG (Pmode, this_regno);
5103   temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5104   temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5105
5106   if (vcall_offset == 0)
5107     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5108   else
5109     {
5110       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5111
5112       addr = this_rtx;
5113       if (delta != 0)
5114         {
5115           if (delta >= -256 && delta < 256)
5116             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5117                                        plus_constant (Pmode, this_rtx, delta));
5118           else
5119             aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5120                                 temp1, temp0, false);
5121         }
5122
5123       if (Pmode == ptr_mode)
5124         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5125       else
5126         aarch64_emit_move (temp0,
5127                            gen_rtx_ZERO_EXTEND (Pmode,
5128                                                 gen_rtx_MEM (ptr_mode, addr)));
5129
5130       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5131           addr = plus_constant (Pmode, temp0, vcall_offset);
5132       else
5133         {
5134           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5135                                           Pmode);
5136           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5137         }
5138
5139       if (Pmode == ptr_mode)
5140         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5141       else
5142         aarch64_emit_move (temp1,
5143                            gen_rtx_SIGN_EXTEND (Pmode,
5144                                                 gen_rtx_MEM (ptr_mode, addr)));
5145
5146       emit_insn (gen_add2_insn (this_rtx, temp1));
5147     }
5148
5149   /* Generate a tail call to the target function.  */
5150   if (!TREE_USED (function))
5151     {
5152       assemble_external (function);
5153       TREE_USED (function) = 1;
5154     }
5155   funexp = XEXP (DECL_RTL (function), 0);
5156   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5157   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5158   SIBLING_CALL_P (insn) = 1;
5159
5160   insn = get_insns ();
5161   shorten_branches (insn);
5162   final_start_function (insn, file, 1);
5163   final (insn, file, 1);
5164   final_end_function ();
5165
5166   /* Stop pretending to be a post-reload pass.  */
5167   reload_completed = 0;
5168 }
5169
5170 static bool
5171 aarch64_tls_referenced_p (rtx x)
5172 {
5173   if (!TARGET_HAVE_TLS)
5174     return false;
5175   subrtx_iterator::array_type array;
5176   FOR_EACH_SUBRTX (iter, array, x, ALL)
5177     {
5178       const_rtx x = *iter;
5179       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5180         return true;
5181       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5182          TLS offsets, not real symbol references.  */
5183       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5184         iter.skip_subrtxes ();
5185     }
5186   return false;
5187 }
5188
5189
5190 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5191    a left shift of 0 or 12 bits.  */
5192 bool
5193 aarch64_uimm12_shift (HOST_WIDE_INT val)
5194 {
5195   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5196           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5197           );
5198 }
5199
5200
5201 /* Return true if val is an immediate that can be loaded into a
5202    register by a MOVZ instruction.  */
5203 static bool
5204 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5205 {
5206   if (GET_MODE_SIZE (mode) > 4)
5207     {
5208       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5209           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5210         return 1;
5211     }
5212   else
5213     {
5214       /* Ignore sign extension.  */
5215       val &= (HOST_WIDE_INT) 0xffffffff;
5216     }
5217   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5218           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5219 }
5220
5221 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
5222    64-bit (DImode) integer.  */
5223
5224 static unsigned HOST_WIDE_INT
5225 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5226 {
5227   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5228   while (size < 64)
5229     {
5230       val &= (HOST_WIDE_INT_1U << size) - 1;
5231       val |= val << size;
5232       size *= 2;
5233     }
5234   return val;
5235 }
5236
5237 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
5238
5239 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5240   {
5241     0x0000000100000001ull,
5242     0x0001000100010001ull,
5243     0x0101010101010101ull,
5244     0x1111111111111111ull,
5245     0x5555555555555555ull,
5246   };
5247
5248
5249 /* Return true if val is a valid bitmask immediate.  */
5250
5251 bool
5252 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5253 {
5254   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5255   int bits;
5256
5257   /* Check for a single sequence of one bits and return quickly if so.
5258      The special cases of all ones and all zeroes returns false.  */
5259   val = aarch64_replicate_bitmask_imm (val_in, mode);
5260   tmp = val + (val & -val);
5261
5262   if (tmp == (tmp & -tmp))
5263     return (val + 1) > 1;
5264
5265   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
5266   if (mode == SImode)
5267     val = (val << 32) | (val & 0xffffffff);
5268
5269   /* Invert if the immediate doesn't start with a zero bit - this means we
5270      only need to search for sequences of one bits.  */
5271   if (val & 1)
5272     val = ~val;
5273
5274   /* Find the first set bit and set tmp to val with the first sequence of one
5275      bits removed.  Return success if there is a single sequence of ones.  */
5276   first_one = val & -val;
5277   tmp = val & (val + first_one);
5278
5279   if (tmp == 0)
5280     return true;
5281
5282   /* Find the next set bit and compute the difference in bit position.  */
5283   next_one = tmp & -tmp;
5284   bits = clz_hwi (first_one) - clz_hwi (next_one);
5285   mask = val ^ tmp;
5286
5287   /* Check the bit position difference is a power of 2, and that the first
5288      sequence of one bits fits within 'bits' bits.  */
5289   if ((mask >> bits) != 0 || bits != (bits & -bits))
5290     return false;
5291
5292   /* Check the sequence of one bits is repeated 64/bits times.  */
5293   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5294 }
5295
5296 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5297    Assumed precondition: VAL_IN Is not zero.  */
5298
5299 unsigned HOST_WIDE_INT
5300 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5301 {
5302   int lowest_bit_set = ctz_hwi (val_in);
5303   int highest_bit_set = floor_log2 (val_in);
5304   gcc_assert (val_in != 0);
5305
5306   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5307           (HOST_WIDE_INT_1U << lowest_bit_set));
5308 }
5309
5310 /* Create constant where bits outside of lowest bit set to highest bit set
5311    are set to 1.  */
5312
5313 unsigned HOST_WIDE_INT
5314 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5315 {
5316   return val_in | ~aarch64_and_split_imm1 (val_in);
5317 }
5318
5319 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
5320
5321 bool
5322 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5323 {
5324   scalar_int_mode int_mode;
5325   if (!is_a <scalar_int_mode> (mode, &int_mode))
5326     return false;
5327
5328   if (aarch64_bitmask_imm (val_in, int_mode))
5329     return false;
5330
5331   if (aarch64_move_imm (val_in, int_mode))
5332     return false;
5333
5334   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5335
5336   return aarch64_bitmask_imm (imm2, int_mode);
5337 }
5338
5339 /* Return true if val is an immediate that can be loaded into a
5340    register in a single instruction.  */
5341 bool
5342 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5343 {
5344   scalar_int_mode int_mode;
5345   if (!is_a <scalar_int_mode> (mode, &int_mode))
5346     return false;
5347
5348   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5349     return 1;
5350   return aarch64_bitmask_imm (val, int_mode);
5351 }
5352
5353 static bool
5354 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5355 {
5356   rtx base, offset;
5357
5358   if (GET_CODE (x) == HIGH)
5359     return true;
5360
5361   /* There's no way to calculate VL-based values using relocations.  */
5362   subrtx_iterator::array_type array;
5363   FOR_EACH_SUBRTX (iter, array, x, ALL)
5364     if (GET_CODE (*iter) == CONST_POLY_INT)
5365       return true;
5366
5367   split_const (x, &base, &offset);
5368   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5369     {
5370       if (aarch64_classify_symbol (base, INTVAL (offset))
5371           != SYMBOL_FORCE_TO_MEM)
5372         return true;
5373       else
5374         /* Avoid generating a 64-bit relocation in ILP32; leave
5375            to aarch64_expand_mov_immediate to handle it properly.  */
5376         return mode != ptr_mode;
5377     }
5378
5379   return aarch64_tls_referenced_p (x);
5380 }
5381
5382 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5383    The expansion for a table switch is quite expensive due to the number
5384    of instructions, the table lookup and hard to predict indirect jump.
5385    When optimizing for speed, and -O3 enabled, use the per-core tuning if
5386    set, otherwise use tables for > 16 cases as a tradeoff between size and
5387    performance.  When optimizing for size, use the default setting.  */
5388
5389 static unsigned int
5390 aarch64_case_values_threshold (void)
5391 {
5392   /* Use the specified limit for the number of cases before using jump
5393      tables at higher optimization levels.  */
5394   if (optimize > 2
5395       && selected_cpu->tune->max_case_values != 0)
5396     return selected_cpu->tune->max_case_values;
5397   else
5398     return optimize_size ? default_case_values_threshold () : 17;
5399 }
5400
5401 /* Return true if register REGNO is a valid index register.
5402    STRICT_P is true if REG_OK_STRICT is in effect.  */
5403
5404 bool
5405 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5406 {
5407   if (!HARD_REGISTER_NUM_P (regno))
5408     {
5409       if (!strict_p)
5410         return true;
5411
5412       if (!reg_renumber)
5413         return false;
5414
5415       regno = reg_renumber[regno];
5416     }
5417   return GP_REGNUM_P (regno);
5418 }
5419
5420 /* Return true if register REGNO is a valid base register for mode MODE.
5421    STRICT_P is true if REG_OK_STRICT is in effect.  */
5422
5423 bool
5424 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5425 {
5426   if (!HARD_REGISTER_NUM_P (regno))
5427     {
5428       if (!strict_p)
5429         return true;
5430
5431       if (!reg_renumber)
5432         return false;
5433
5434       regno = reg_renumber[regno];
5435     }
5436
5437   /* The fake registers will be eliminated to either the stack or
5438      hard frame pointer, both of which are usually valid base registers.
5439      Reload deals with the cases where the eliminated form isn't valid.  */
5440   return (GP_REGNUM_P (regno)
5441           || regno == SP_REGNUM
5442           || regno == FRAME_POINTER_REGNUM
5443           || regno == ARG_POINTER_REGNUM);
5444 }
5445
5446 /* Return true if X is a valid base register for mode MODE.
5447    STRICT_P is true if REG_OK_STRICT is in effect.  */
5448
5449 static bool
5450 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5451 {
5452   if (!strict_p
5453       && GET_CODE (x) == SUBREG
5454       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5455     x = SUBREG_REG (x);
5456
5457   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5458 }
5459
5460 /* Return true if address offset is a valid index.  If it is, fill in INFO
5461    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
5462
5463 static bool
5464 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5465                         machine_mode mode, bool strict_p)
5466 {
5467   enum aarch64_address_type type;
5468   rtx index;
5469   int shift;
5470
5471   /* (reg:P) */
5472   if ((REG_P (x) || GET_CODE (x) == SUBREG)
5473       && GET_MODE (x) == Pmode)
5474     {
5475       type = ADDRESS_REG_REG;
5476       index = x;
5477       shift = 0;
5478     }
5479   /* (sign_extend:DI (reg:SI)) */
5480   else if ((GET_CODE (x) == SIGN_EXTEND
5481             || GET_CODE (x) == ZERO_EXTEND)
5482            && GET_MODE (x) == DImode
5483            && GET_MODE (XEXP (x, 0)) == SImode)
5484     {
5485       type = (GET_CODE (x) == SIGN_EXTEND)
5486         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5487       index = XEXP (x, 0);
5488       shift = 0;
5489     }
5490   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5491   else if (GET_CODE (x) == MULT
5492            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5493                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5494            && GET_MODE (XEXP (x, 0)) == DImode
5495            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5496            && CONST_INT_P (XEXP (x, 1)))
5497     {
5498       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5499         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5500       index = XEXP (XEXP (x, 0), 0);
5501       shift = exact_log2 (INTVAL (XEXP (x, 1)));
5502     }
5503   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5504   else if (GET_CODE (x) == ASHIFT
5505            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5506                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5507            && GET_MODE (XEXP (x, 0)) == DImode
5508            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5509            && CONST_INT_P (XEXP (x, 1)))
5510     {
5511       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5512         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5513       index = XEXP (XEXP (x, 0), 0);
5514       shift = INTVAL (XEXP (x, 1));
5515     }
5516   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5517   else if ((GET_CODE (x) == SIGN_EXTRACT
5518             || GET_CODE (x) == ZERO_EXTRACT)
5519            && GET_MODE (x) == DImode
5520            && GET_CODE (XEXP (x, 0)) == MULT
5521            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5522            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5523     {
5524       type = (GET_CODE (x) == SIGN_EXTRACT)
5525         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5526       index = XEXP (XEXP (x, 0), 0);
5527       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5528       if (INTVAL (XEXP (x, 1)) != 32 + shift
5529           || INTVAL (XEXP (x, 2)) != 0)
5530         shift = -1;
5531     }
5532   /* (and:DI (mult:DI (reg:DI) (const_int scale))
5533      (const_int 0xffffffff<<shift)) */
5534   else if (GET_CODE (x) == AND
5535            && GET_MODE (x) == DImode
5536            && GET_CODE (XEXP (x, 0)) == MULT
5537            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5538            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5539            && CONST_INT_P (XEXP (x, 1)))
5540     {
5541       type = ADDRESS_REG_UXTW;
5542       index = XEXP (XEXP (x, 0), 0);
5543       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5544       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5545         shift = -1;
5546     }
5547   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5548   else if ((GET_CODE (x) == SIGN_EXTRACT
5549             || GET_CODE (x) == ZERO_EXTRACT)
5550            && GET_MODE (x) == DImode
5551            && GET_CODE (XEXP (x, 0)) == ASHIFT
5552            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5553            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5554     {
5555       type = (GET_CODE (x) == SIGN_EXTRACT)
5556         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5557       index = XEXP (XEXP (x, 0), 0);
5558       shift = INTVAL (XEXP (XEXP (x, 0), 1));
5559       if (INTVAL (XEXP (x, 1)) != 32 + shift
5560           || INTVAL (XEXP (x, 2)) != 0)
5561         shift = -1;
5562     }
5563   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5564      (const_int 0xffffffff<<shift)) */
5565   else if (GET_CODE (x) == AND
5566            && GET_MODE (x) == DImode
5567            && GET_CODE (XEXP (x, 0)) == ASHIFT
5568            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5569            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5570            && CONST_INT_P (XEXP (x, 1)))
5571     {
5572       type = ADDRESS_REG_UXTW;
5573       index = XEXP (XEXP (x, 0), 0);
5574       shift = INTVAL (XEXP (XEXP (x, 0), 1));
5575       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5576         shift = -1;
5577     }
5578   /* (mult:P (reg:P) (const_int scale)) */
5579   else if (GET_CODE (x) == MULT
5580            && GET_MODE (x) == Pmode
5581            && GET_MODE (XEXP (x, 0)) == Pmode
5582            && CONST_INT_P (XEXP (x, 1)))
5583     {
5584       type = ADDRESS_REG_REG;
5585       index = XEXP (x, 0);
5586       shift = exact_log2 (INTVAL (XEXP (x, 1)));
5587     }
5588   /* (ashift:P (reg:P) (const_int shift)) */
5589   else if (GET_CODE (x) == ASHIFT
5590            && GET_MODE (x) == Pmode
5591            && GET_MODE (XEXP (x, 0)) == Pmode
5592            && CONST_INT_P (XEXP (x, 1)))
5593     {
5594       type = ADDRESS_REG_REG;
5595       index = XEXP (x, 0);
5596       shift = INTVAL (XEXP (x, 1));
5597     }
5598   else
5599     return false;
5600
5601   if (!strict_p
5602       && GET_CODE (index) == SUBREG
5603       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5604     index = SUBREG_REG (index);
5605
5606   if (aarch64_sve_data_mode_p (mode))
5607     {
5608       if (type != ADDRESS_REG_REG
5609           || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5610         return false;
5611     }
5612   else
5613     {
5614       if (shift != 0
5615           && !(IN_RANGE (shift, 1, 3)
5616                && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5617         return false;
5618     }
5619
5620   if (REG_P (index)
5621       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5622     {
5623       info->type = type;
5624       info->offset = index;
5625       info->shift = shift;
5626       return true;
5627     }
5628
5629   return false;
5630 }
5631
5632 /* Return true if MODE is one of the modes for which we
5633    support LDP/STP operations.  */
5634
5635 static bool
5636 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5637 {
5638   return mode == SImode || mode == DImode
5639          || mode == SFmode || mode == DFmode
5640          || (aarch64_vector_mode_supported_p (mode)
5641              && known_eq (GET_MODE_SIZE (mode), 8));
5642 }
5643
5644 /* Return true if REGNO is a virtual pointer register, or an eliminable
5645    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
5646    include stack_pointer or hard_frame_pointer.  */
5647 static bool
5648 virt_or_elim_regno_p (unsigned regno)
5649 {
5650   return ((regno >= FIRST_VIRTUAL_REGISTER
5651            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5652           || regno == FRAME_POINTER_REGNUM
5653           || regno == ARG_POINTER_REGNUM);
5654 }
5655
5656 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5657    If it is, fill in INFO appropriately.  STRICT_P is true if
5658    REG_OK_STRICT is in effect.  */
5659
5660 static bool
5661 aarch64_classify_address (struct aarch64_address_info *info,
5662                           rtx x, machine_mode mode, bool strict_p,
5663                           aarch64_addr_query_type type = ADDR_QUERY_M)
5664 {
5665   enum rtx_code code = GET_CODE (x);
5666   rtx op0, op1;
5667   poly_int64 offset;
5668
5669   HOST_WIDE_INT const_size;
5670
5671   /* On BE, we use load/store pair for all large int mode load/stores.
5672      TI/TFmode may also use a load/store pair.  */
5673   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5674   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5675   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5676                             || mode == TImode
5677                             || mode == TFmode
5678                             || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5679
5680   bool allow_reg_index_p = (!load_store_pair_p
5681                             && (known_lt (GET_MODE_SIZE (mode), 16)
5682                                 || vec_flags == VEC_ADVSIMD
5683                                 || vec_flags == VEC_SVE_DATA));
5684
5685   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5686      [Rn, #offset, MUL VL].  */
5687   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5688       && (code != REG && code != PLUS))
5689     return false;
5690
5691   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5692      REG addressing.  */
5693   if (advsimd_struct_p
5694       && !BYTES_BIG_ENDIAN
5695       && (code != POST_INC && code != REG))
5696     return false;
5697
5698   gcc_checking_assert (GET_MODE (x) == VOIDmode
5699                        || SCALAR_INT_MODE_P (GET_MODE (x)));
5700
5701   switch (code)
5702     {
5703     case REG:
5704     case SUBREG:
5705       info->type = ADDRESS_REG_IMM;
5706       info->base = x;
5707       info->offset = const0_rtx;
5708       info->const_offset = 0;
5709       return aarch64_base_register_rtx_p (x, strict_p);
5710
5711     case PLUS:
5712       op0 = XEXP (x, 0);
5713       op1 = XEXP (x, 1);
5714
5715       if (! strict_p
5716           && REG_P (op0)
5717           && virt_or_elim_regno_p (REGNO (op0))
5718           && poly_int_rtx_p (op1, &offset))
5719         {
5720           info->type = ADDRESS_REG_IMM;
5721           info->base = op0;
5722           info->offset = op1;
5723           info->const_offset = offset;
5724
5725           return true;
5726         }
5727
5728       if (maybe_ne (GET_MODE_SIZE (mode), 0)
5729           && aarch64_base_register_rtx_p (op0, strict_p)
5730           && poly_int_rtx_p (op1, &offset))
5731         {
5732           info->type = ADDRESS_REG_IMM;
5733           info->base = op0;
5734           info->offset = op1;
5735           info->const_offset = offset;
5736
5737           /* TImode and TFmode values are allowed in both pairs of X
5738              registers and individual Q registers.  The available
5739              address modes are:
5740              X,X: 7-bit signed scaled offset
5741              Q:   9-bit signed offset
5742              We conservatively require an offset representable in either mode.
5743              When performing the check for pairs of X registers i.e.  LDP/STP
5744              pass down DImode since that is the natural size of the LDP/STP
5745              instruction memory accesses.  */
5746           if (mode == TImode || mode == TFmode)
5747             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5748                     && (offset_9bit_signed_unscaled_p (mode, offset)
5749                         || offset_12bit_unsigned_scaled_p (mode, offset)));
5750
5751           /* A 7bit offset check because OImode will emit a ldp/stp
5752              instruction (only big endian will get here).
5753              For ldp/stp instructions, the offset is scaled for the size of a
5754              single element of the pair.  */
5755           if (mode == OImode)
5756             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5757
5758           /* Three 9/12 bit offsets checks because CImode will emit three
5759              ldr/str instructions (only big endian will get here).  */
5760           if (mode == CImode)
5761             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5762                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5763                         || offset_12bit_unsigned_scaled_p (V16QImode,
5764                                                            offset + 32)));
5765
5766           /* Two 7bit offsets checks because XImode will emit two ldp/stp
5767              instructions (only big endian will get here).  */
5768           if (mode == XImode)
5769             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5770                     && aarch64_offset_7bit_signed_scaled_p (TImode,
5771                                                             offset + 32));
5772
5773           /* Make "m" use the LD1 offset range for SVE data modes, so
5774              that pre-RTL optimizers like ivopts will work to that
5775              instead of the wider LDR/STR range.  */
5776           if (vec_flags == VEC_SVE_DATA)
5777             return (type == ADDR_QUERY_M
5778                     ? offset_4bit_signed_scaled_p (mode, offset)
5779                     : offset_9bit_signed_scaled_p (mode, offset));
5780
5781           if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5782             {
5783               poly_int64 end_offset = (offset
5784                                        + GET_MODE_SIZE (mode)
5785                                        - BYTES_PER_SVE_VECTOR);
5786               return (type == ADDR_QUERY_M
5787                       ? offset_4bit_signed_scaled_p (mode, offset)
5788                       : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5789                          && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5790                                                          end_offset)));
5791             }
5792
5793           if (vec_flags == VEC_SVE_PRED)
5794             return offset_9bit_signed_scaled_p (mode, offset);
5795
5796           if (load_store_pair_p)
5797             return ((known_eq (GET_MODE_SIZE (mode), 4)
5798                      || known_eq (GET_MODE_SIZE (mode), 8))
5799                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5800           else
5801             return (offset_9bit_signed_unscaled_p (mode, offset)
5802                     || offset_12bit_unsigned_scaled_p (mode, offset));
5803         }
5804
5805       if (allow_reg_index_p)
5806         {
5807           /* Look for base + (scaled/extended) index register.  */
5808           if (aarch64_base_register_rtx_p (op0, strict_p)
5809               && aarch64_classify_index (info, op1, mode, strict_p))
5810             {
5811               info->base = op0;
5812               return true;
5813             }
5814           if (aarch64_base_register_rtx_p (op1, strict_p)
5815               && aarch64_classify_index (info, op0, mode, strict_p))
5816             {
5817               info->base = op1;
5818               return true;
5819             }
5820         }
5821
5822       return false;
5823
5824     case POST_INC:
5825     case POST_DEC:
5826     case PRE_INC:
5827     case PRE_DEC:
5828       info->type = ADDRESS_REG_WB;
5829       info->base = XEXP (x, 0);
5830       info->offset = NULL_RTX;
5831       return aarch64_base_register_rtx_p (info->base, strict_p);
5832
5833     case POST_MODIFY:
5834     case PRE_MODIFY:
5835       info->type = ADDRESS_REG_WB;
5836       info->base = XEXP (x, 0);
5837       if (GET_CODE (XEXP (x, 1)) == PLUS
5838           && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5839           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5840           && aarch64_base_register_rtx_p (info->base, strict_p))
5841         {
5842           info->offset = XEXP (XEXP (x, 1), 1);
5843           info->const_offset = offset;
5844
5845           /* TImode and TFmode values are allowed in both pairs of X
5846              registers and individual Q registers.  The available
5847              address modes are:
5848              X,X: 7-bit signed scaled offset
5849              Q:   9-bit signed offset
5850              We conservatively require an offset representable in either mode.
5851            */
5852           if (mode == TImode || mode == TFmode)
5853             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5854                     && offset_9bit_signed_unscaled_p (mode, offset));
5855
5856           if (load_store_pair_p)
5857             return ((known_eq (GET_MODE_SIZE (mode), 4)
5858                      || known_eq (GET_MODE_SIZE (mode), 8))
5859                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5860           else
5861             return offset_9bit_signed_unscaled_p (mode, offset);
5862         }
5863       return false;
5864
5865     case CONST:
5866     case SYMBOL_REF:
5867     case LABEL_REF:
5868       /* load literal: pc-relative constant pool entry.  Only supported
5869          for SI mode or larger.  */
5870       info->type = ADDRESS_SYMBOLIC;
5871
5872       if (!load_store_pair_p
5873           && GET_MODE_SIZE (mode).is_constant (&const_size)
5874           && const_size >= 4)
5875         {
5876           rtx sym, addend;
5877
5878           split_const (x, &sym, &addend);
5879           return ((GET_CODE (sym) == LABEL_REF
5880                    || (GET_CODE (sym) == SYMBOL_REF
5881                        && CONSTANT_POOL_ADDRESS_P (sym)
5882                        && aarch64_pcrelative_literal_loads)));
5883         }
5884       return false;
5885
5886     case LO_SUM:
5887       info->type = ADDRESS_LO_SUM;
5888       info->base = XEXP (x, 0);
5889       info->offset = XEXP (x, 1);
5890       if (allow_reg_index_p
5891           && aarch64_base_register_rtx_p (info->base, strict_p))
5892         {
5893           rtx sym, offs;
5894           split_const (info->offset, &sym, &offs);
5895           if (GET_CODE (sym) == SYMBOL_REF
5896               && (aarch64_classify_symbol (sym, INTVAL (offs))
5897                   == SYMBOL_SMALL_ABSOLUTE))
5898             {
5899               /* The symbol and offset must be aligned to the access size.  */
5900               unsigned int align;
5901
5902               if (CONSTANT_POOL_ADDRESS_P (sym))
5903                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5904               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5905                 {
5906                   tree exp = SYMBOL_REF_DECL (sym);
5907                   align = TYPE_ALIGN (TREE_TYPE (exp));
5908                   align = aarch64_constant_alignment (exp, align);
5909                 }
5910               else if (SYMBOL_REF_DECL (sym))
5911                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5912               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5913                        && SYMBOL_REF_BLOCK (sym) != NULL)
5914                 align = SYMBOL_REF_BLOCK (sym)->alignment;
5915               else
5916                 align = BITS_PER_UNIT;
5917
5918               poly_int64 ref_size = GET_MODE_SIZE (mode);
5919               if (known_eq (ref_size, 0))
5920                 ref_size = GET_MODE_SIZE (DImode);
5921
5922               return (multiple_p (INTVAL (offs), ref_size)
5923                       && multiple_p (align / BITS_PER_UNIT, ref_size));
5924             }
5925         }
5926       return false;
5927
5928     default:
5929       return false;
5930     }
5931 }
5932
5933 /* Return true if the address X is valid for a PRFM instruction.
5934    STRICT_P is true if we should do strict checking with
5935    aarch64_classify_address.  */
5936
5937 bool
5938 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5939 {
5940   struct aarch64_address_info addr;
5941
5942   /* PRFM accepts the same addresses as DImode...  */
5943   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5944   if (!res)
5945     return false;
5946
5947   /* ... except writeback forms.  */
5948   return addr.type != ADDRESS_REG_WB;
5949 }
5950
5951 bool
5952 aarch64_symbolic_address_p (rtx x)
5953 {
5954   rtx offset;
5955
5956   split_const (x, &x, &offset);
5957   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5958 }
5959
5960 /* Classify the base of symbolic expression X.  */
5961
5962 enum aarch64_symbol_type
5963 aarch64_classify_symbolic_expression (rtx x)
5964 {
5965   rtx offset;
5966
5967   split_const (x, &x, &offset);
5968   return aarch64_classify_symbol (x, INTVAL (offset));
5969 }
5970
5971
5972 /* Return TRUE if X is a legitimate address for accessing memory in
5973    mode MODE.  */
5974 static bool
5975 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5976 {
5977   struct aarch64_address_info addr;
5978
5979   return aarch64_classify_address (&addr, x, mode, strict_p);
5980 }
5981
5982 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5983    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
5984 bool
5985 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5986                               aarch64_addr_query_type type)
5987 {
5988   struct aarch64_address_info addr;
5989
5990   return aarch64_classify_address (&addr, x, mode, strict_p, type);
5991 }
5992
5993 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
5994
5995 static bool
5996 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
5997                                          poly_int64 orig_offset,
5998                                          machine_mode mode)
5999 {
6000   HOST_WIDE_INT size;
6001   if (GET_MODE_SIZE (mode).is_constant (&size))
6002     {
6003       HOST_WIDE_INT const_offset, second_offset;
6004
6005       /* A general SVE offset is A * VQ + B.  Remove the A component from
6006          coefficient 0 in order to get the constant B.  */
6007       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6008
6009       /* Split an out-of-range address displacement into a base and
6010          offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
6011          range otherwise to increase opportunities for sharing the base
6012          address of different sizes.  Unaligned accesses use the signed
6013          9-bit range, TImode/TFmode use the intersection of signed
6014          scaled 7-bit and signed 9-bit offset.  */
6015       if (mode == TImode || mode == TFmode)
6016         second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6017       else if ((const_offset & (size - 1)) != 0)
6018         second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6019       else
6020         second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6021
6022       if (second_offset == 0 || known_eq (orig_offset, second_offset))
6023         return false;
6024
6025       /* Split the offset into second_offset and the rest.  */
6026       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6027       *offset2 = gen_int_mode (second_offset, Pmode);
6028       return true;
6029     }
6030   else
6031     {
6032       /* Get the mode we should use as the basis of the range.  For structure
6033          modes this is the mode of one vector.  */
6034       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6035       machine_mode step_mode
6036         = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6037
6038       /* Get the "mul vl" multiplier we'd like to use.  */
6039       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6040       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6041       if (vec_flags & VEC_SVE_DATA)
6042         /* LDR supports a 9-bit range, but the move patterns for
6043            structure modes require all vectors to be in range of the
6044            same base.  The simplest way of accomodating that while still
6045            promoting reuse of anchor points between different modes is
6046            to use an 8-bit range unconditionally.  */
6047         vnum = ((vnum + 128) & 255) - 128;
6048       else
6049         /* Predicates are only handled singly, so we might as well use
6050            the full range.  */
6051         vnum = ((vnum + 256) & 511) - 256;
6052       if (vnum == 0)
6053         return false;
6054
6055       /* Convert the "mul vl" multiplier into a byte offset.  */
6056       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6057       if (known_eq (second_offset, orig_offset))
6058         return false;
6059
6060       /* Split the offset into second_offset and the rest.  */
6061       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6062       *offset2 = gen_int_mode (second_offset, Pmode);
6063       return true;
6064     }
6065 }
6066
6067 /* Return the binary representation of floating point constant VALUE in INTVAL.
6068    If the value cannot be converted, return false without setting INTVAL.
6069    The conversion is done in the given MODE.  */
6070 bool
6071 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6072 {
6073
6074   /* We make a general exception for 0.  */
6075   if (aarch64_float_const_zero_rtx_p (value))
6076     {
6077       *intval = 0;
6078       return true;
6079     }
6080
6081   scalar_float_mode mode;
6082   if (GET_CODE (value) != CONST_DOUBLE
6083       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6084       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6085       /* Only support up to DF mode.  */
6086       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6087     return false;
6088
6089   unsigned HOST_WIDE_INT ival = 0;
6090
6091   long res[2];
6092   real_to_target (res,
6093                   CONST_DOUBLE_REAL_VALUE (value),
6094                   REAL_MODE_FORMAT (mode));
6095
6096   if (mode == DFmode)
6097     {
6098       int order = BYTES_BIG_ENDIAN ? 1 : 0;
6099       ival = zext_hwi (res[order], 32);
6100       ival |= (zext_hwi (res[1 - order], 32) << 32);
6101     }
6102   else
6103       ival = zext_hwi (res[0], 32);
6104
6105   *intval = ival;
6106   return true;
6107 }
6108
6109 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6110    single MOV(+MOVK) followed by an FMOV.  */
6111 bool
6112 aarch64_float_const_rtx_p (rtx x)
6113 {
6114   machine_mode mode = GET_MODE (x);
6115   if (mode == VOIDmode)
6116     return false;
6117
6118   /* Determine whether it's cheaper to write float constants as
6119      mov/movk pairs over ldr/adrp pairs.  */
6120   unsigned HOST_WIDE_INT ival;
6121
6122   if (GET_CODE (x) == CONST_DOUBLE
6123       && SCALAR_FLOAT_MODE_P (mode)
6124       && aarch64_reinterpret_float_as_int (x, &ival))
6125     {
6126       scalar_int_mode imode = (mode == HFmode
6127                                ? SImode
6128                                : int_mode_for_mode (mode).require ());
6129       int num_instr = aarch64_internal_mov_immediate
6130                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6131       return num_instr < 3;
6132     }
6133
6134   return false;
6135 }
6136
6137 /* Return TRUE if rtx X is immediate constant 0.0 */
6138 bool
6139 aarch64_float_const_zero_rtx_p (rtx x)
6140 {
6141   if (GET_MODE (x) == VOIDmode)
6142     return false;
6143
6144   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6145     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6146   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6147 }
6148
6149 /* Return TRUE if rtx X is immediate constant that fits in a single
6150    MOVI immediate operation.  */
6151 bool
6152 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6153 {
6154   if (!TARGET_SIMD)
6155      return false;
6156
6157   machine_mode vmode;
6158   scalar_int_mode imode;
6159   unsigned HOST_WIDE_INT ival;
6160
6161   if (GET_CODE (x) == CONST_DOUBLE
6162       && SCALAR_FLOAT_MODE_P (mode))
6163     {
6164       if (!aarch64_reinterpret_float_as_int (x, &ival))
6165         return false;
6166
6167       /* We make a general exception for 0.  */
6168       if (aarch64_float_const_zero_rtx_p (x))
6169         return true;
6170
6171       imode = int_mode_for_mode (mode).require ();
6172     }
6173   else if (GET_CODE (x) == CONST_INT
6174            && is_a <scalar_int_mode> (mode, &imode))
6175     ival = INTVAL (x);
6176   else
6177     return false;
6178
6179    /* use a 64 bit mode for everything except for DI/DF mode, where we use
6180      a 128 bit vector mode.  */
6181   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6182
6183   vmode = aarch64_simd_container_mode (imode, width);
6184   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6185
6186   return aarch64_simd_valid_immediate (v_op, NULL);
6187 }
6188
6189
6190 /* Return the fixed registers used for condition codes.  */
6191
6192 static bool
6193 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6194 {
6195   *p1 = CC_REGNUM;
6196   *p2 = INVALID_REGNUM;
6197   return true;
6198 }
6199
6200 /* This function is used by the call expanders of the machine description.
6201    RESULT is the register in which the result is returned.  It's NULL for
6202    "call" and "sibcall".
6203    MEM is the location of the function call.
6204    SIBCALL indicates whether this function call is normal call or sibling call.
6205    It will generate different pattern accordingly.  */
6206
6207 void
6208 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6209 {
6210   rtx call, callee, tmp;
6211   rtvec vec;
6212   machine_mode mode;
6213
6214   gcc_assert (MEM_P (mem));
6215   callee = XEXP (mem, 0);
6216   mode = GET_MODE (callee);
6217   gcc_assert (mode == Pmode);
6218
6219   /* Decide if we should generate indirect calls by loading the
6220      address of the callee into a register before performing
6221      the branch-and-link.  */
6222   if (SYMBOL_REF_P (callee)
6223       ? (aarch64_is_long_call_p (callee)
6224          || aarch64_is_noplt_call_p (callee))
6225       : !REG_P (callee))
6226     XEXP (mem, 0) = force_reg (mode, callee);
6227
6228   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6229
6230   if (result != NULL_RTX)
6231     call = gen_rtx_SET (result, call);
6232
6233   if (sibcall)
6234     tmp = ret_rtx;
6235   else
6236     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6237
6238   vec = gen_rtvec (2, call, tmp);
6239   call = gen_rtx_PARALLEL (VOIDmode, vec);
6240
6241   aarch64_emit_call_insn (call);
6242 }
6243
6244 /* Emit call insn with PAT and do aarch64-specific handling.  */
6245
6246 void
6247 aarch64_emit_call_insn (rtx pat)
6248 {
6249   rtx insn = emit_call_insn (pat);
6250
6251   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6252   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6253   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6254 }
6255
6256 machine_mode
6257 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6258 {
6259   /* All floating point compares return CCFP if it is an equality
6260      comparison, and CCFPE otherwise.  */
6261   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6262     {
6263       switch (code)
6264         {
6265         case EQ:
6266         case NE:
6267         case UNORDERED:
6268         case ORDERED:
6269         case UNLT:
6270         case UNLE:
6271         case UNGT:
6272         case UNGE:
6273         case UNEQ:
6274           return CCFPmode;
6275
6276         case LT:
6277         case LE:
6278         case GT:
6279         case GE:
6280         case LTGT:
6281           return CCFPEmode;
6282
6283         default:
6284           gcc_unreachable ();
6285         }
6286     }
6287
6288   /* Equality comparisons of short modes against zero can be performed
6289      using the TST instruction with the appropriate bitmask.  */
6290   if (y == const0_rtx && REG_P (x)
6291       && (code == EQ || code == NE)
6292       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6293     return CC_NZmode;
6294
6295   /* Similarly, comparisons of zero_extends from shorter modes can
6296      be performed using an ANDS with an immediate mask.  */
6297   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6298       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6299       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6300       && (code == EQ || code == NE))
6301     return CC_NZmode;
6302
6303   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6304       && y == const0_rtx
6305       && (code == EQ || code == NE || code == LT || code == GE)
6306       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6307           || GET_CODE (x) == NEG
6308           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6309               && CONST_INT_P (XEXP (x, 2)))))
6310     return CC_NZmode;
6311
6312   /* A compare with a shifted operand.  Because of canonicalization,
6313      the comparison will have to be swapped when we emit the assembly
6314      code.  */
6315   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6316       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6317       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6318           || GET_CODE (x) == LSHIFTRT
6319           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6320     return CC_SWPmode;
6321
6322   /* Similarly for a negated operand, but we can only do this for
6323      equalities.  */
6324   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6325       && (REG_P (y) || GET_CODE (y) == SUBREG)
6326       && (code == EQ || code == NE)
6327       && GET_CODE (x) == NEG)
6328     return CC_Zmode;
6329
6330   /* A test for unsigned overflow.  */
6331   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6332       && code == NE
6333       && GET_CODE (x) == PLUS
6334       && GET_CODE (y) == ZERO_EXTEND)
6335     return CC_Cmode;
6336
6337   /* For everything else, return CCmode.  */
6338   return CCmode;
6339 }
6340
6341 static int
6342 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6343
6344 int
6345 aarch64_get_condition_code (rtx x)
6346 {
6347   machine_mode mode = GET_MODE (XEXP (x, 0));
6348   enum rtx_code comp_code = GET_CODE (x);
6349
6350   if (GET_MODE_CLASS (mode) != MODE_CC)
6351     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6352   return aarch64_get_condition_code_1 (mode, comp_code);
6353 }
6354
6355 static int
6356 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6357 {
6358   switch (mode)
6359     {
6360     case E_CCFPmode:
6361     case E_CCFPEmode:
6362       switch (comp_code)
6363         {
6364         case GE: return AARCH64_GE;
6365         case GT: return AARCH64_GT;
6366         case LE: return AARCH64_LS;
6367         case LT: return AARCH64_MI;
6368         case NE: return AARCH64_NE;
6369         case EQ: return AARCH64_EQ;
6370         case ORDERED: return AARCH64_VC;
6371         case UNORDERED: return AARCH64_VS;
6372         case UNLT: return AARCH64_LT;
6373         case UNLE: return AARCH64_LE;
6374         case UNGT: return AARCH64_HI;
6375         case UNGE: return AARCH64_PL;
6376         default: return -1;
6377         }
6378       break;
6379
6380     case E_CCmode:
6381       switch (comp_code)
6382         {
6383         case NE: return AARCH64_NE;
6384         case EQ: return AARCH64_EQ;
6385         case GE: return AARCH64_GE;
6386         case GT: return AARCH64_GT;
6387         case LE: return AARCH64_LE;
6388         case LT: return AARCH64_LT;
6389         case GEU: return AARCH64_CS;
6390         case GTU: return AARCH64_HI;
6391         case LEU: return AARCH64_LS;
6392         case LTU: return AARCH64_CC;
6393         default: return -1;
6394         }
6395       break;
6396
6397     case E_CC_SWPmode:
6398       switch (comp_code)
6399         {
6400         case NE: return AARCH64_NE;
6401         case EQ: return AARCH64_EQ;
6402         case GE: return AARCH64_LE;
6403         case GT: return AARCH64_LT;
6404         case LE: return AARCH64_GE;
6405         case LT: return AARCH64_GT;
6406         case GEU: return AARCH64_LS;
6407         case GTU: return AARCH64_CC;
6408         case LEU: return AARCH64_CS;
6409         case LTU: return AARCH64_HI;
6410         default: return -1;
6411         }
6412       break;
6413
6414     case E_CC_NZmode:
6415       switch (comp_code)
6416         {
6417         case NE: return AARCH64_NE;
6418         case EQ: return AARCH64_EQ;
6419         case GE: return AARCH64_PL;
6420         case LT: return AARCH64_MI;
6421         default: return -1;
6422         }
6423       break;
6424
6425     case E_CC_Zmode:
6426       switch (comp_code)
6427         {
6428         case NE: return AARCH64_NE;
6429         case EQ: return AARCH64_EQ;
6430         default: return -1;
6431         }
6432       break;
6433
6434     case E_CC_Cmode:
6435       switch (comp_code)
6436         {
6437         case NE: return AARCH64_CS;
6438         case EQ: return AARCH64_CC;
6439         default: return -1;
6440         }
6441       break;
6442
6443     default:
6444       return -1;
6445     }
6446
6447   return -1;
6448 }
6449
6450 bool
6451 aarch64_const_vec_all_same_in_range_p (rtx x,
6452                                        HOST_WIDE_INT minval,
6453                                        HOST_WIDE_INT maxval)
6454 {
6455   rtx elt;
6456   return (const_vec_duplicate_p (x, &elt)
6457           && CONST_INT_P (elt)
6458           && IN_RANGE (INTVAL (elt), minval, maxval));
6459 }
6460
6461 bool
6462 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6463 {
6464   return aarch64_const_vec_all_same_in_range_p (x, val, val);
6465 }
6466
6467 /* Return true if VEC is a constant in which every element is in the range
6468    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
6469
6470 static bool
6471 aarch64_const_vec_all_in_range_p (rtx vec,
6472                                   HOST_WIDE_INT minval,
6473                                   HOST_WIDE_INT maxval)
6474 {
6475   if (GET_CODE (vec) != CONST_VECTOR
6476       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6477     return false;
6478
6479   int nunits;
6480   if (!CONST_VECTOR_STEPPED_P (vec))
6481     nunits = const_vector_encoded_nelts (vec);
6482   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6483     return false;
6484
6485   for (int i = 0; i < nunits; i++)
6486     {
6487       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6488       if (!CONST_INT_P (vec_elem)
6489           || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6490         return false;
6491     }
6492   return true;
6493 }
6494
6495 /* N Z C V.  */
6496 #define AARCH64_CC_V 1
6497 #define AARCH64_CC_C (1 << 1)
6498 #define AARCH64_CC_Z (1 << 2)
6499 #define AARCH64_CC_N (1 << 3)
6500
6501 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
6502 static const int aarch64_nzcv_codes[] =
6503 {
6504   0,            /* EQ, Z == 1.  */
6505   AARCH64_CC_Z, /* NE, Z == 0.  */
6506   0,            /* CS, C == 1.  */
6507   AARCH64_CC_C, /* CC, C == 0.  */
6508   0,            /* MI, N == 1.  */
6509   AARCH64_CC_N, /* PL, N == 0.  */
6510   0,            /* VS, V == 1.  */
6511   AARCH64_CC_V, /* VC, V == 0.  */
6512   0,            /* HI, C ==1 && Z == 0.  */
6513   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
6514   AARCH64_CC_V, /* GE, N == V.  */
6515   0,            /* LT, N != V.  */
6516   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
6517   0,            /* LE, !(Z == 0 && N == V).  */
6518   0,            /* AL, Any.  */
6519   0             /* NV, Any.  */
6520 };
6521
6522 /* Print floating-point vector immediate operand X to F, negating it
6523    first if NEGATE is true.  Return true on success, false if it isn't
6524    a constant we can handle.  */
6525
6526 static bool
6527 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6528 {
6529   rtx elt;
6530
6531   if (!const_vec_duplicate_p (x, &elt))
6532     return false;
6533
6534   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6535   if (negate)
6536     r = real_value_negate (&r);
6537
6538   /* We only handle the SVE single-bit immediates here.  */
6539   if (real_equal (&r, &dconst0))
6540     asm_fprintf (f, "0.0");
6541   else if (real_equal (&r, &dconst1))
6542     asm_fprintf (f, "1.0");
6543   else if (real_equal (&r, &dconsthalf))
6544     asm_fprintf (f, "0.5");
6545   else
6546     return false;
6547
6548   return true;
6549 }
6550
6551 /* Return the equivalent letter for size.  */
6552 static char
6553 sizetochar (int size)
6554 {
6555   switch (size)
6556     {
6557     case 64: return 'd';
6558     case 32: return 's';
6559     case 16: return 'h';
6560     case 8 : return 'b';
6561     default: gcc_unreachable ();
6562     }
6563 }
6564
6565 /* Print operand X to file F in a target specific manner according to CODE.
6566    The acceptable formatting commands given by CODE are:
6567      'c':               An integer or symbol address without a preceding #
6568                         sign.
6569      'C':               Take the duplicated element in a vector constant
6570                         and print it in hex.
6571      'D':               Take the duplicated element in a vector constant
6572                         and print it as an unsigned integer, in decimal.
6573      'e':               Print the sign/zero-extend size as a character 8->b,
6574                         16->h, 32->w.
6575      'p':               Prints N such that 2^N == X (X must be power of 2 and
6576                         const int).
6577      'P':               Print the number of non-zero bits in X (a const_int).
6578      'H':               Print the higher numbered register of a pair (TImode)
6579                         of regs.
6580      'm':               Print a condition (eq, ne, etc).
6581      'M':               Same as 'm', but invert condition.
6582      'N':               Take the duplicated element in a vector constant
6583                         and print the negative of it in decimal.
6584      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
6585      'S/T/U/V':         Print a FP/SIMD register name for a register list.
6586                         The register printed is the FP/SIMD register name
6587                         of X + 0/1/2/3 for S/T/U/V.
6588      'R':               Print a scalar FP/SIMD register name + 1.
6589      'X':               Print bottom 16 bits of integer constant in hex.
6590      'w/x':             Print a general register name or the zero register
6591                         (32-bit or 64-bit).
6592      '0':               Print a normal operand, if it's a general register,
6593                         then we assume DImode.
6594      'k':               Print NZCV for conditional compare instructions.
6595      'A':               Output address constant representing the first
6596                         argument of X, specifying a relocation offset
6597                         if appropriate.
6598      'L':               Output constant address specified by X
6599                         with a relocation offset if appropriate.
6600      'G':               Prints address of X, specifying a PC relative
6601                         relocation mode if appropriate.
6602      'y':               Output address of LDP or STP - this is used for
6603                         some LDP/STPs which don't use a PARALLEL in their
6604                         pattern (so the mode needs to be adjusted).
6605      'z':               Output address of a typical LDP or STP.  */
6606
6607 static void
6608 aarch64_print_operand (FILE *f, rtx x, int code)
6609 {
6610   rtx elt;
6611   switch (code)
6612     {
6613     case 'c':
6614       switch (GET_CODE (x))
6615         {
6616         case CONST_INT:
6617           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6618           break;
6619
6620         case SYMBOL_REF:
6621           output_addr_const (f, x);
6622           break;
6623
6624         case CONST:
6625           if (GET_CODE (XEXP (x, 0)) == PLUS
6626               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6627             {
6628               output_addr_const (f, x);
6629               break;
6630             }
6631           /* Fall through.  */
6632
6633         default:
6634           output_operand_lossage ("unsupported operand for code '%c'", code);
6635         }
6636       break;
6637
6638     case 'e':
6639       {
6640         int n;
6641
6642         if (!CONST_INT_P (x)
6643             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6644           {
6645             output_operand_lossage ("invalid operand for '%%%c'", code);
6646             return;
6647           }
6648
6649         switch (n)
6650           {
6651           case 3:
6652             fputc ('b', f);
6653             break;
6654           case 4:
6655             fputc ('h', f);
6656             break;
6657           case 5:
6658             fputc ('w', f);
6659             break;
6660           default:
6661             output_operand_lossage ("invalid operand for '%%%c'", code);
6662             return;
6663           }
6664       }
6665       break;
6666
6667     case 'p':
6668       {
6669         int n;
6670
6671         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6672           {
6673             output_operand_lossage ("invalid operand for '%%%c'", code);
6674             return;
6675           }
6676
6677         asm_fprintf (f, "%d", n);
6678       }
6679       break;
6680
6681     case 'P':
6682       if (!CONST_INT_P (x))
6683         {
6684           output_operand_lossage ("invalid operand for '%%%c'", code);
6685           return;
6686         }
6687
6688       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6689       break;
6690
6691     case 'H':
6692       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6693         {
6694           output_operand_lossage ("invalid operand for '%%%c'", code);
6695           return;
6696         }
6697
6698       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6699       break;
6700
6701     case 'M':
6702     case 'm':
6703       {
6704         int cond_code;
6705         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
6706         if (x == const_true_rtx)
6707           {
6708             if (code == 'M')
6709               fputs ("nv", f);
6710             return;
6711           }
6712
6713         if (!COMPARISON_P (x))
6714           {
6715             output_operand_lossage ("invalid operand for '%%%c'", code);
6716             return;
6717           }
6718
6719         cond_code = aarch64_get_condition_code (x);
6720         gcc_assert (cond_code >= 0);
6721         if (code == 'M')
6722           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6723         fputs (aarch64_condition_codes[cond_code], f);
6724       }
6725       break;
6726
6727     case 'N':
6728       if (!const_vec_duplicate_p (x, &elt))
6729         {
6730           output_operand_lossage ("invalid vector constant");
6731           return;
6732         }
6733
6734       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6735         asm_fprintf (f, "%wd", -INTVAL (elt));
6736       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6737                && aarch64_print_vector_float_operand (f, x, true))
6738         ;
6739       else
6740         {
6741           output_operand_lossage ("invalid vector constant");
6742           return;
6743         }
6744       break;
6745
6746     case 'b':
6747     case 'h':
6748     case 's':
6749     case 'd':
6750     case 'q':
6751       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6752         {
6753           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6754           return;
6755         }
6756       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6757       break;
6758
6759     case 'S':
6760     case 'T':
6761     case 'U':
6762     case 'V':
6763       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6764         {
6765           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6766           return;
6767         }
6768       asm_fprintf (f, "%c%d",
6769                    aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6770                    REGNO (x) - V0_REGNUM + (code - 'S'));
6771       break;
6772
6773     case 'R':
6774       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6775         {
6776           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6777           return;
6778         }
6779       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6780       break;
6781
6782     case 'X':
6783       if (!CONST_INT_P (x))
6784         {
6785           output_operand_lossage ("invalid operand for '%%%c'", code);
6786           return;
6787         }
6788       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6789       break;
6790
6791     case 'C':
6792       {
6793         /* Print a replicated constant in hex.  */
6794         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6795           {
6796             output_operand_lossage ("invalid operand for '%%%c'", code);
6797             return;
6798           }
6799         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6800         asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6801       }
6802       break;
6803
6804     case 'D':
6805       {
6806         /* Print a replicated constant in decimal, treating it as
6807            unsigned.  */
6808         if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6809           {
6810             output_operand_lossage ("invalid operand for '%%%c'", code);
6811             return;
6812           }
6813         scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6814         asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6815       }
6816       break;
6817
6818     case 'w':
6819     case 'x':
6820       if (x == const0_rtx
6821           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6822         {
6823           asm_fprintf (f, "%czr", code);
6824           break;
6825         }
6826
6827       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6828         {
6829           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6830           break;
6831         }
6832
6833       if (REG_P (x) && REGNO (x) == SP_REGNUM)
6834         {
6835           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6836           break;
6837         }
6838
6839       /* Fall through */
6840
6841     case 0:
6842       if (x == NULL)
6843         {
6844           output_operand_lossage ("missing operand");
6845           return;
6846         }
6847
6848       switch (GET_CODE (x))
6849         {
6850         case REG:
6851           if (aarch64_sve_data_mode_p (GET_MODE (x)))
6852             {
6853               if (REG_NREGS (x) == 1)
6854                 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6855               else
6856                 {
6857                   char suffix
6858                     = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6859                   asm_fprintf (f, "{z%d.%c - z%d.%c}",
6860                                REGNO (x) - V0_REGNUM, suffix,
6861                                END_REGNO (x) - V0_REGNUM - 1, suffix);
6862                 }
6863             }
6864           else
6865             asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6866           break;
6867
6868         case MEM:
6869           output_address (GET_MODE (x), XEXP (x, 0));
6870           break;
6871
6872         case LABEL_REF:
6873         case SYMBOL_REF:
6874           output_addr_const (asm_out_file, x);
6875           break;
6876
6877         case CONST_INT:
6878           asm_fprintf (f, "%wd", INTVAL (x));
6879           break;
6880
6881         case CONST:
6882           if (!VECTOR_MODE_P (GET_MODE (x)))
6883             {
6884               output_addr_const (asm_out_file, x);
6885               break;
6886             }
6887           /* fall through */
6888
6889         case CONST_VECTOR:
6890           if (!const_vec_duplicate_p (x, &elt))
6891             {
6892               output_operand_lossage ("invalid vector constant");
6893               return;
6894             }
6895
6896           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6897             asm_fprintf (f, "%wd", INTVAL (elt));
6898           else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6899                    && aarch64_print_vector_float_operand (f, x, false))
6900             ;
6901           else
6902             {
6903               output_operand_lossage ("invalid vector constant");
6904               return;
6905             }
6906           break;
6907
6908         case CONST_DOUBLE:
6909           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6910              be getting CONST_DOUBLEs holding integers.  */
6911           gcc_assert (GET_MODE (x) != VOIDmode);
6912           if (aarch64_float_const_zero_rtx_p (x))
6913             {
6914               fputc ('0', f);
6915               break;
6916             }
6917           else if (aarch64_float_const_representable_p (x))
6918             {
6919 #define buf_size 20
6920               char float_buf[buf_size] = {'\0'};
6921               real_to_decimal_for_mode (float_buf,
6922                                         CONST_DOUBLE_REAL_VALUE (x),
6923                                         buf_size, buf_size,
6924                                         1, GET_MODE (x));
6925               asm_fprintf (asm_out_file, "%s", float_buf);
6926               break;
6927 #undef buf_size
6928             }
6929           output_operand_lossage ("invalid constant");
6930           return;
6931         default:
6932           output_operand_lossage ("invalid operand");
6933           return;
6934         }
6935       break;
6936
6937     case 'A':
6938       if (GET_CODE (x) == HIGH)
6939         x = XEXP (x, 0);
6940
6941       switch (aarch64_classify_symbolic_expression (x))
6942         {
6943         case SYMBOL_SMALL_GOT_4G:
6944           asm_fprintf (asm_out_file, ":got:");
6945           break;
6946
6947         case SYMBOL_SMALL_TLSGD:
6948           asm_fprintf (asm_out_file, ":tlsgd:");
6949           break;
6950
6951         case SYMBOL_SMALL_TLSDESC:
6952           asm_fprintf (asm_out_file, ":tlsdesc:");
6953           break;
6954
6955         case SYMBOL_SMALL_TLSIE:
6956           asm_fprintf (asm_out_file, ":gottprel:");
6957           break;
6958
6959         case SYMBOL_TLSLE24:
6960           asm_fprintf (asm_out_file, ":tprel:");
6961           break;
6962
6963         case SYMBOL_TINY_GOT:
6964           gcc_unreachable ();
6965           break;
6966
6967         default:
6968           break;
6969         }
6970       output_addr_const (asm_out_file, x);
6971       break;
6972
6973     case 'L':
6974       switch (aarch64_classify_symbolic_expression (x))
6975         {
6976         case SYMBOL_SMALL_GOT_4G:
6977           asm_fprintf (asm_out_file, ":lo12:");
6978           break;
6979
6980         case SYMBOL_SMALL_TLSGD:
6981           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6982           break;
6983
6984         case SYMBOL_SMALL_TLSDESC:
6985           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6986           break;
6987
6988         case SYMBOL_SMALL_TLSIE:
6989           asm_fprintf (asm_out_file, ":gottprel_lo12:");
6990           break;
6991
6992         case SYMBOL_TLSLE12:
6993           asm_fprintf (asm_out_file, ":tprel_lo12:");
6994           break;
6995
6996         case SYMBOL_TLSLE24:
6997           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6998           break;
6999
7000         case SYMBOL_TINY_GOT:
7001           asm_fprintf (asm_out_file, ":got:");
7002           break;
7003
7004         case SYMBOL_TINY_TLSIE:
7005           asm_fprintf (asm_out_file, ":gottprel:");
7006           break;
7007
7008         default:
7009           break;
7010         }
7011       output_addr_const (asm_out_file, x);
7012       break;
7013
7014     case 'G':
7015       switch (aarch64_classify_symbolic_expression (x))
7016         {
7017         case SYMBOL_TLSLE24:
7018           asm_fprintf (asm_out_file, ":tprel_hi12:");
7019           break;
7020         default:
7021           break;
7022         }
7023       output_addr_const (asm_out_file, x);
7024       break;
7025
7026     case 'k':
7027       {
7028         HOST_WIDE_INT cond_code;
7029
7030         if (!CONST_INT_P (x))
7031           {
7032             output_operand_lossage ("invalid operand for '%%%c'", code);
7033             return;
7034           }
7035
7036         cond_code = INTVAL (x);
7037         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7038         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7039       }
7040       break;
7041
7042     case 'y':
7043     case 'z':
7044       {
7045         machine_mode mode = GET_MODE (x);
7046
7047         if (GET_CODE (x) != MEM
7048             || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7049           {
7050             output_operand_lossage ("invalid operand for '%%%c'", code);
7051             return;
7052           }
7053
7054         if (code == 'y')
7055           /* LDP/STP which uses a single double-width memory operand.
7056              Adjust the mode to appear like a typical LDP/STP.
7057              Currently this is supported for 16-byte accesses only.  */
7058           mode = DFmode;
7059
7060         if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7061           output_operand_lossage ("invalid operand prefix '%%%c'", code);
7062       }
7063       break;
7064
7065     default:
7066       output_operand_lossage ("invalid operand prefix '%%%c'", code);
7067       return;
7068     }
7069 }
7070
7071 /* Print address 'x' of a memory access with mode 'mode'.
7072    'op' is the context required by aarch64_classify_address.  It can either be
7073    MEM for a normal memory access or PARALLEL for LDP/STP.  */
7074 static bool
7075 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7076                                 aarch64_addr_query_type type)
7077 {
7078   struct aarch64_address_info addr;
7079   unsigned int size;
7080
7081   /* Check all addresses are Pmode - including ILP32.  */
7082   if (GET_MODE (x) != Pmode)
7083     output_operand_lossage ("invalid address mode");
7084
7085   if (aarch64_classify_address (&addr, x, mode, true, type))
7086     switch (addr.type)
7087       {
7088       case ADDRESS_REG_IMM:
7089         if (known_eq (addr.const_offset, 0))
7090           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7091         else if (aarch64_sve_data_mode_p (mode))
7092           {
7093             HOST_WIDE_INT vnum
7094               = exact_div (addr.const_offset,
7095                            BYTES_PER_SVE_VECTOR).to_constant ();
7096             asm_fprintf (f, "[%s, #%wd, mul vl]",
7097                          reg_names[REGNO (addr.base)], vnum);
7098           }
7099         else if (aarch64_sve_pred_mode_p (mode))
7100           {
7101             HOST_WIDE_INT vnum
7102               = exact_div (addr.const_offset,
7103                            BYTES_PER_SVE_PRED).to_constant ();
7104             asm_fprintf (f, "[%s, #%wd, mul vl]",
7105                          reg_names[REGNO (addr.base)], vnum);
7106           }
7107         else
7108           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7109                        INTVAL (addr.offset));
7110         return true;
7111
7112       case ADDRESS_REG_REG:
7113         if (addr.shift == 0)
7114           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7115                        reg_names [REGNO (addr.offset)]);
7116         else
7117           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7118                        reg_names [REGNO (addr.offset)], addr.shift);
7119         return true;
7120
7121       case ADDRESS_REG_UXTW:
7122         if (addr.shift == 0)
7123           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7124                        REGNO (addr.offset) - R0_REGNUM);
7125         else
7126           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7127                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
7128         return true;
7129
7130       case ADDRESS_REG_SXTW:
7131         if (addr.shift == 0)
7132           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7133                        REGNO (addr.offset) - R0_REGNUM);
7134         else
7135           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7136                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
7137         return true;
7138
7139       case ADDRESS_REG_WB:
7140         /* Writeback is only supported for fixed-width modes.  */
7141         size = GET_MODE_SIZE (mode).to_constant ();
7142         switch (GET_CODE (x))
7143           {
7144           case PRE_INC:
7145             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7146             return true;
7147           case POST_INC:
7148             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7149             return true;
7150           case PRE_DEC:
7151             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7152             return true;
7153           case POST_DEC:
7154             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7155             return true;
7156           case PRE_MODIFY:
7157             asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7158                          INTVAL (addr.offset));
7159             return true;
7160           case POST_MODIFY:
7161             asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7162                          INTVAL (addr.offset));
7163             return true;
7164           default:
7165             break;
7166           }
7167         break;
7168
7169       case ADDRESS_LO_SUM:
7170         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7171         output_addr_const (f, addr.offset);
7172         asm_fprintf (f, "]");
7173         return true;
7174
7175       case ADDRESS_SYMBOLIC:
7176         output_addr_const (f, x);
7177         return true;
7178       }
7179
7180   return false;
7181 }
7182
7183 /* Print address 'x' of a LDP/STP with mode 'mode'.  */
7184 static bool
7185 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7186 {
7187   return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7188 }
7189
7190 /* Print address 'x' of a memory access with mode 'mode'.  */
7191 static void
7192 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7193 {
7194   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7195     output_addr_const (f, x);
7196 }
7197
7198 bool
7199 aarch64_label_mentioned_p (rtx x)
7200 {
7201   const char *fmt;
7202   int i;
7203
7204   if (GET_CODE (x) == LABEL_REF)
7205     return true;
7206
7207   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7208      referencing instruction, but they are constant offsets, not
7209      symbols.  */
7210   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7211     return false;
7212
7213   fmt = GET_RTX_FORMAT (GET_CODE (x));
7214   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7215     {
7216       if (fmt[i] == 'E')
7217         {
7218           int j;
7219
7220           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7221             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7222               return 1;
7223         }
7224       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7225         return 1;
7226     }
7227
7228   return 0;
7229 }
7230
7231 /* Implement REGNO_REG_CLASS.  */
7232
7233 enum reg_class
7234 aarch64_regno_regclass (unsigned regno)
7235 {
7236   if (GP_REGNUM_P (regno))
7237     return GENERAL_REGS;
7238
7239   if (regno == SP_REGNUM)
7240     return STACK_REG;
7241
7242   if (regno == FRAME_POINTER_REGNUM
7243       || regno == ARG_POINTER_REGNUM)
7244     return POINTER_REGS;
7245
7246   if (FP_REGNUM_P (regno))
7247     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
7248
7249   if (PR_REGNUM_P (regno))
7250     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7251
7252   return NO_REGS;
7253 }
7254
7255 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7256    If OFFSET is out of range, return an offset of an anchor point
7257    that is in range.  Return 0 otherwise.  */
7258
7259 static HOST_WIDE_INT
7260 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7261                        machine_mode mode)
7262 {
7263   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
7264   if (size > 16)
7265     return (offset + 0x400) & ~0x7f0;
7266
7267   /* For offsets that aren't a multiple of the access size, the limit is
7268      -256...255.  */
7269   if (offset & (size - 1))
7270     {
7271       /* BLKmode typically uses LDP of X-registers.  */
7272       if (mode == BLKmode)
7273         return (offset + 512) & ~0x3ff;
7274       return (offset + 0x100) & ~0x1ff;
7275     }
7276
7277   /* Small negative offsets are supported.  */
7278   if (IN_RANGE (offset, -256, 0))
7279     return 0;
7280
7281   if (mode == TImode || mode == TFmode)
7282     return (offset + 0x100) & ~0x1ff;
7283
7284   /* Use 12-bit offset by access size.  */
7285   return offset & (~0xfff * size);
7286 }
7287
7288 static rtx
7289 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
7290 {
7291   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7292      where mask is selected by alignment and size of the offset.
7293      We try to pick as large a range for the offset as possible to
7294      maximize the chance of a CSE.  However, for aligned addresses
7295      we limit the range to 4k so that structures with different sized
7296      elements are likely to use the same base.  We need to be careful
7297      not to split a CONST for some forms of address expression, otherwise
7298      it will generate sub-optimal code.  */
7299
7300   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7301     {
7302       rtx base = XEXP (x, 0);
7303       rtx offset_rtx = XEXP (x, 1);
7304       HOST_WIDE_INT offset = INTVAL (offset_rtx);
7305
7306       if (GET_CODE (base) == PLUS)
7307         {
7308           rtx op0 = XEXP (base, 0);
7309           rtx op1 = XEXP (base, 1);
7310
7311           /* Force any scaling into a temp for CSE.  */
7312           op0 = force_reg (Pmode, op0);
7313           op1 = force_reg (Pmode, op1);
7314
7315           /* Let the pointer register be in op0.  */
7316           if (REG_POINTER (op1))
7317             std::swap (op0, op1);
7318
7319           /* If the pointer is virtual or frame related, then we know that
7320              virtual register instantiation or register elimination is going
7321              to apply a second constant.  We want the two constants folded
7322              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
7323           if (virt_or_elim_regno_p (REGNO (op0)))
7324             {
7325               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7326                                    NULL_RTX, true, OPTAB_DIRECT);
7327               return gen_rtx_PLUS (Pmode, base, op1);
7328             }
7329
7330           /* Otherwise, in order to encourage CSE (and thence loop strength
7331              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
7332           base = expand_binop (Pmode, add_optab, op0, op1,
7333                                NULL_RTX, true, OPTAB_DIRECT);
7334           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7335         }
7336
7337       HOST_WIDE_INT size;
7338       if (GET_MODE_SIZE (mode).is_constant (&size))
7339         {
7340           HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7341                                                              mode);
7342           if (base_offset != 0)
7343             {
7344               base = plus_constant (Pmode, base, base_offset);
7345               base = force_operand (base, NULL_RTX);
7346               return plus_constant (Pmode, base, offset - base_offset);
7347             }
7348         }
7349     }
7350
7351   return x;
7352 }
7353
7354 /* Return the reload icode required for a constant pool in mode.  */
7355 static enum insn_code
7356 aarch64_constant_pool_reload_icode (machine_mode mode)
7357 {
7358   switch (mode)
7359     {
7360     case E_SFmode:
7361       return CODE_FOR_aarch64_reload_movcpsfdi;
7362
7363     case E_DFmode:
7364       return CODE_FOR_aarch64_reload_movcpdfdi;
7365
7366     case E_TFmode:
7367       return CODE_FOR_aarch64_reload_movcptfdi;
7368
7369     case E_V8QImode:
7370       return CODE_FOR_aarch64_reload_movcpv8qidi;
7371
7372     case E_V16QImode:
7373       return CODE_FOR_aarch64_reload_movcpv16qidi;
7374
7375     case E_V4HImode:
7376       return CODE_FOR_aarch64_reload_movcpv4hidi;
7377
7378     case E_V8HImode:
7379       return CODE_FOR_aarch64_reload_movcpv8hidi;
7380
7381     case E_V2SImode:
7382       return CODE_FOR_aarch64_reload_movcpv2sidi;
7383
7384     case E_V4SImode:
7385       return CODE_FOR_aarch64_reload_movcpv4sidi;
7386
7387     case E_V2DImode:
7388       return CODE_FOR_aarch64_reload_movcpv2didi;
7389
7390     case E_V2DFmode:
7391       return CODE_FOR_aarch64_reload_movcpv2dfdi;
7392
7393     default:
7394       gcc_unreachable ();
7395     }
7396
7397   gcc_unreachable ();
7398 }
7399 static reg_class_t
7400 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7401                           reg_class_t rclass,
7402                           machine_mode mode,
7403                           secondary_reload_info *sri)
7404 {
7405   /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7406      directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
7407      comment at the head of aarch64-sve.md for more details about the
7408      big-endian handling.  */
7409   if (BYTES_BIG_ENDIAN
7410       && reg_class_subset_p (rclass, FP_REGS)
7411       && !((REG_P (x) && HARD_REGISTER_P (x))
7412            || aarch64_simd_valid_immediate (x, NULL))
7413       && aarch64_sve_data_mode_p (mode))
7414     {
7415       sri->icode = CODE_FOR_aarch64_sve_reload_be;
7416       return NO_REGS;
7417     }
7418
7419   /* If we have to disable direct literal pool loads and stores because the
7420      function is too big, then we need a scratch register.  */
7421   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7422       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7423           || targetm.vector_mode_supported_p (GET_MODE (x)))
7424       && !aarch64_pcrelative_literal_loads)
7425     {
7426       sri->icode = aarch64_constant_pool_reload_icode (mode);
7427       return NO_REGS;
7428     }
7429
7430   /* Without the TARGET_SIMD instructions we cannot move a Q register
7431      to a Q register directly.  We need a scratch.  */
7432   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7433       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7434       && reg_class_subset_p (rclass, FP_REGS))
7435     {
7436       if (mode == TFmode)
7437         sri->icode = CODE_FOR_aarch64_reload_movtf;
7438       else if (mode == TImode)
7439         sri->icode = CODE_FOR_aarch64_reload_movti;
7440       return NO_REGS;
7441     }
7442
7443   /* A TFmode or TImode memory access should be handled via an FP_REGS
7444      because AArch64 has richer addressing modes for LDR/STR instructions
7445      than LDP/STP instructions.  */
7446   if (TARGET_FLOAT && rclass == GENERAL_REGS
7447       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7448     return FP_REGS;
7449
7450   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7451       return GENERAL_REGS;
7452
7453   return NO_REGS;
7454 }
7455
7456 static bool
7457 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7458 {
7459   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7460
7461   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7462      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
7463   if (frame_pointer_needed)
7464     return to == HARD_FRAME_POINTER_REGNUM;
7465   return true;
7466 }
7467
7468 poly_int64
7469 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7470 {
7471   aarch64_layout_frame ();
7472
7473   if (to == HARD_FRAME_POINTER_REGNUM)
7474     {
7475       if (from == ARG_POINTER_REGNUM)
7476         return cfun->machine->frame.hard_fp_offset;
7477
7478       if (from == FRAME_POINTER_REGNUM)
7479         return cfun->machine->frame.hard_fp_offset
7480                - cfun->machine->frame.locals_offset;
7481     }
7482
7483   if (to == STACK_POINTER_REGNUM)
7484     {
7485       if (from == FRAME_POINTER_REGNUM)
7486           return cfun->machine->frame.frame_size
7487                  - cfun->machine->frame.locals_offset;
7488     }
7489
7490   return cfun->machine->frame.frame_size;
7491 }
7492
7493 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
7494    previous frame.  */
7495
7496 rtx
7497 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7498 {
7499   if (count != 0)
7500     return const0_rtx;
7501   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7502 }
7503
7504
7505 static void
7506 aarch64_asm_trampoline_template (FILE *f)
7507 {
7508   if (TARGET_ILP32)
7509     {
7510       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7511       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7512     }
7513   else
7514     {
7515       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7516       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7517     }
7518   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7519   assemble_aligned_integer (4, const0_rtx);
7520   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7521   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7522 }
7523
7524 static void
7525 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7526 {
7527   rtx fnaddr, mem, a_tramp;
7528   const int tramp_code_sz = 16;
7529
7530   /* Don't need to copy the trailing D-words, we fill those in below.  */
7531   emit_block_move (m_tramp, assemble_trampoline_template (),
7532                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7533   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7534   fnaddr = XEXP (DECL_RTL (fndecl), 0);
7535   if (GET_MODE (fnaddr) != ptr_mode)
7536     fnaddr = convert_memory_address (ptr_mode, fnaddr);
7537   emit_move_insn (mem, fnaddr);
7538
7539   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7540   emit_move_insn (mem, chain_value);
7541
7542   /* XXX We should really define a "clear_cache" pattern and use
7543      gen_clear_cache().  */
7544   a_tramp = XEXP (m_tramp, 0);
7545   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7546                      LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7547                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7548                      ptr_mode);
7549 }
7550
7551 static unsigned char
7552 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7553 {
7554   /* ??? Logically we should only need to provide a value when
7555      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7556      can hold MODE, but at the moment we need to handle all modes.
7557      Just ignore any runtime parts for registers that can't store them.  */
7558   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7559   unsigned int nregs;
7560   switch (regclass)
7561     {
7562     case TAILCALL_ADDR_REGS:
7563     case POINTER_REGS:
7564     case GENERAL_REGS:
7565     case ALL_REGS:
7566     case POINTER_AND_FP_REGS:
7567     case FP_REGS:
7568     case FP_LO_REGS:
7569       if (aarch64_sve_data_mode_p (mode)
7570           && constant_multiple_p (GET_MODE_SIZE (mode),
7571                                   BYTES_PER_SVE_VECTOR, &nregs))
7572         return nregs;
7573       return (aarch64_vector_data_mode_p (mode)
7574               ? CEIL (lowest_size, UNITS_PER_VREG)
7575               : CEIL (lowest_size, UNITS_PER_WORD));
7576     case STACK_REG:
7577     case PR_REGS:
7578     case PR_LO_REGS:
7579     case PR_HI_REGS:
7580       return 1;
7581
7582     case NO_REGS:
7583       return 0;
7584
7585     default:
7586       break;
7587     }
7588   gcc_unreachable ();
7589 }
7590
7591 static reg_class_t
7592 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7593 {
7594   if (regclass == POINTER_REGS)
7595     return GENERAL_REGS;
7596
7597   if (regclass == STACK_REG)
7598     {
7599       if (REG_P(x)
7600           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7601           return regclass;
7602
7603       return NO_REGS;
7604     }
7605
7606   /* Register eliminiation can result in a request for
7607      SP+constant->FP_REGS.  We cannot support such operations which
7608      use SP as source and an FP_REG as destination, so reject out
7609      right now.  */
7610   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7611     {
7612       rtx lhs = XEXP (x, 0);
7613
7614       /* Look through a possible SUBREG introduced by ILP32.  */
7615       if (GET_CODE (lhs) == SUBREG)
7616         lhs = SUBREG_REG (lhs);
7617
7618       gcc_assert (REG_P (lhs));
7619       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7620                                       POINTER_REGS));
7621       return NO_REGS;
7622     }
7623
7624   return regclass;
7625 }
7626
7627 void
7628 aarch64_asm_output_labelref (FILE* f, const char *name)
7629 {
7630   asm_fprintf (f, "%U%s", name);
7631 }
7632
7633 static void
7634 aarch64_elf_asm_constructor (rtx symbol, int priority)
7635 {
7636   if (priority == DEFAULT_INIT_PRIORITY)
7637     default_ctor_section_asm_out_constructor (symbol, priority);
7638   else
7639     {
7640       section *s;
7641       /* While priority is known to be in range [0, 65535], so 18 bytes
7642          would be enough, the compiler might not know that.  To avoid
7643          -Wformat-truncation false positive, use a larger size.  */
7644       char buf[23];
7645       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7646       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7647       switch_to_section (s);
7648       assemble_align (POINTER_SIZE);
7649       assemble_aligned_integer (POINTER_BYTES, symbol);
7650     }
7651 }
7652
7653 static void
7654 aarch64_elf_asm_destructor (rtx symbol, int priority)
7655 {
7656   if (priority == DEFAULT_INIT_PRIORITY)
7657     default_dtor_section_asm_out_destructor (symbol, priority);
7658   else
7659     {
7660       section *s;
7661       /* While priority is known to be in range [0, 65535], so 18 bytes
7662          would be enough, the compiler might not know that.  To avoid
7663          -Wformat-truncation false positive, use a larger size.  */
7664       char buf[23];
7665       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7666       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7667       switch_to_section (s);
7668       assemble_align (POINTER_SIZE);
7669       assemble_aligned_integer (POINTER_BYTES, symbol);
7670     }
7671 }
7672
7673 const char*
7674 aarch64_output_casesi (rtx *operands)
7675 {
7676   char buf[100];
7677   char label[100];
7678   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7679   int index;
7680   static const char *const patterns[4][2] =
7681   {
7682     {
7683       "ldrb\t%w3, [%0,%w1,uxtw]",
7684       "add\t%3, %4, %w3, sxtb #2"
7685     },
7686     {
7687       "ldrh\t%w3, [%0,%w1,uxtw #1]",
7688       "add\t%3, %4, %w3, sxth #2"
7689     },
7690     {
7691       "ldr\t%w3, [%0,%w1,uxtw #2]",
7692       "add\t%3, %4, %w3, sxtw #2"
7693     },
7694     /* We assume that DImode is only generated when not optimizing and
7695        that we don't really need 64-bit address offsets.  That would
7696        imply an object file with 8GB of code in a single function!  */
7697     {
7698       "ldr\t%w3, [%0,%w1,uxtw #2]",
7699       "add\t%3, %4, %w3, sxtw #2"
7700     }
7701   };
7702
7703   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7704
7705   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7706   index = exact_log2 (GET_MODE_SIZE (mode));
7707
7708   gcc_assert (index >= 0 && index <= 3);
7709
7710   /* Need to implement table size reduction, by chaning the code below.  */
7711   output_asm_insn (patterns[index][0], operands);
7712   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7713   snprintf (buf, sizeof (buf),
7714             "adr\t%%4, %s", targetm.strip_name_encoding (label));
7715   output_asm_insn (buf, operands);
7716   output_asm_insn (patterns[index][1], operands);
7717   output_asm_insn ("br\t%3", operands);
7718   assemble_label (asm_out_file, label);
7719   return "";
7720 }
7721
7722
7723 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7724    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7725    operator.  */
7726
7727 int
7728 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7729 {
7730   if (shift >= 0 && shift <= 3)
7731     {
7732       int size;
7733       for (size = 8; size <= 32; size *= 2)
7734         {
7735           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7736           if (mask == bits << shift)
7737             return size;
7738         }
7739     }
7740   return 0;
7741 }
7742
7743 /* Constant pools are per function only when PC relative
7744    literal loads are true or we are in the large memory
7745    model.  */
7746
7747 static inline bool
7748 aarch64_can_use_per_function_literal_pools_p (void)
7749 {
7750   return (aarch64_pcrelative_literal_loads
7751           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7752 }
7753
7754 static bool
7755 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7756 {
7757   /* We can't use blocks for constants when we're using a per-function
7758      constant pool.  */
7759   return !aarch64_can_use_per_function_literal_pools_p ();
7760 }
7761
7762 /* Select appropriate section for constants depending
7763    on where we place literal pools.  */
7764
7765 static section *
7766 aarch64_select_rtx_section (machine_mode mode,
7767                             rtx x,
7768                             unsigned HOST_WIDE_INT align)
7769 {
7770   if (aarch64_can_use_per_function_literal_pools_p ())
7771     return function_section (current_function_decl);
7772
7773   return default_elf_select_rtx_section (mode, x, align);
7774 }
7775
7776 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
7777 void
7778 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7779                                   HOST_WIDE_INT offset)
7780 {
7781   /* When using per-function literal pools, we must ensure that any code
7782      section is aligned to the minimal instruction length, lest we get
7783      errors from the assembler re "unaligned instructions".  */
7784   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7785     ASM_OUTPUT_ALIGN (f, 2);
7786 }
7787
7788 /* Costs.  */
7789
7790 /* Helper function for rtx cost calculation.  Strip a shift expression
7791    from X.  Returns the inner operand if successful, or the original
7792    expression on failure.  */
7793 static rtx
7794 aarch64_strip_shift (rtx x)
7795 {
7796   rtx op = x;
7797
7798   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7799      we can convert both to ROR during final output.  */
7800   if ((GET_CODE (op) == ASHIFT
7801        || GET_CODE (op) == ASHIFTRT
7802        || GET_CODE (op) == LSHIFTRT
7803        || GET_CODE (op) == ROTATERT
7804        || GET_CODE (op) == ROTATE)
7805       && CONST_INT_P (XEXP (op, 1)))
7806     return XEXP (op, 0);
7807
7808   if (GET_CODE (op) == MULT
7809       && CONST_INT_P (XEXP (op, 1))
7810       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7811     return XEXP (op, 0);
7812
7813   return x;
7814 }
7815
7816 /* Helper function for rtx cost calculation.  Strip an extend
7817    expression from X.  Returns the inner operand if successful, or the
7818    original expression on failure.  We deal with a number of possible
7819    canonicalization variations here. If STRIP_SHIFT is true, then
7820    we can strip off a shift also.  */
7821 static rtx
7822 aarch64_strip_extend (rtx x, bool strip_shift)
7823 {
7824   scalar_int_mode mode;
7825   rtx op = x;
7826
7827   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7828     return op;
7829
7830   /* Zero and sign extraction of a widened value.  */
7831   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7832       && XEXP (op, 2) == const0_rtx
7833       && GET_CODE (XEXP (op, 0)) == MULT
7834       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7835                                          XEXP (op, 1)))
7836     return XEXP (XEXP (op, 0), 0);
7837
7838   /* It can also be represented (for zero-extend) as an AND with an
7839      immediate.  */
7840   if (GET_CODE (op) == AND
7841       && GET_CODE (XEXP (op, 0)) == MULT
7842       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7843       && CONST_INT_P (XEXP (op, 1))
7844       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7845                            INTVAL (XEXP (op, 1))) != 0)
7846     return XEXP (XEXP (op, 0), 0);
7847
7848   /* Now handle extended register, as this may also have an optional
7849      left shift by 1..4.  */
7850   if (strip_shift
7851       && GET_CODE (op) == ASHIFT
7852       && CONST_INT_P (XEXP (op, 1))
7853       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7854     op = XEXP (op, 0);
7855
7856   if (GET_CODE (op) == ZERO_EXTEND
7857       || GET_CODE (op) == SIGN_EXTEND)
7858     op = XEXP (op, 0);
7859
7860   if (op != x)
7861     return op;
7862
7863   return x;
7864 }
7865
7866 /* Return true iff CODE is a shift supported in combination
7867    with arithmetic instructions.  */
7868
7869 static bool
7870 aarch64_shift_p (enum rtx_code code)
7871 {
7872   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7873 }
7874
7875
7876 /* Return true iff X is a cheap shift without a sign extend. */
7877
7878 static bool
7879 aarch64_cheap_mult_shift_p (rtx x)
7880 {
7881   rtx op0, op1;
7882
7883   op0 = XEXP (x, 0);
7884   op1 = XEXP (x, 1);
7885
7886   if (!(aarch64_tune_params.extra_tuning_flags
7887                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7888     return false;
7889
7890   if (GET_CODE (op0) == SIGN_EXTEND)
7891     return false;
7892
7893   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7894       && UINTVAL (op1) <= 4)
7895     return true;
7896
7897   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7898     return false;
7899
7900   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7901
7902   if (l2 > 0 && l2 <= 4)
7903     return true;
7904
7905   return false;
7906 }
7907
7908 /* Helper function for rtx cost calculation.  Calculate the cost of
7909    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7910    Return the calculated cost of the expression, recursing manually in to
7911    operands where needed.  */
7912
7913 static int
7914 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7915 {
7916   rtx op0, op1;
7917   const struct cpu_cost_table *extra_cost
7918     = aarch64_tune_params.insn_extra_cost;
7919   int cost = 0;
7920   bool compound_p = (outer == PLUS || outer == MINUS);
7921   machine_mode mode = GET_MODE (x);
7922
7923   gcc_checking_assert (code == MULT);
7924
7925   op0 = XEXP (x, 0);
7926   op1 = XEXP (x, 1);
7927
7928   if (VECTOR_MODE_P (mode))
7929     mode = GET_MODE_INNER (mode);
7930
7931   /* Integer multiply/fma.  */
7932   if (GET_MODE_CLASS (mode) == MODE_INT)
7933     {
7934       /* The multiply will be canonicalized as a shift, cost it as such.  */
7935       if (aarch64_shift_p (GET_CODE (x))
7936           || (CONST_INT_P (op1)
7937               && exact_log2 (INTVAL (op1)) > 0))
7938         {
7939           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7940                            || GET_CODE (op0) == SIGN_EXTEND;
7941           if (speed)
7942             {
7943               if (compound_p)
7944                 {
7945                   /* If the shift is considered cheap,
7946                      then don't add any cost. */
7947                   if (aarch64_cheap_mult_shift_p (x))
7948                     ;
7949                   else if (REG_P (op1))
7950                     /* ARITH + shift-by-register.  */
7951                     cost += extra_cost->alu.arith_shift_reg;
7952                   else if (is_extend)
7953                     /* ARITH + extended register.  We don't have a cost field
7954                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
7955                     cost += extra_cost->alu.extend_arith;
7956                   else
7957                     /* ARITH + shift-by-immediate.  */
7958                     cost += extra_cost->alu.arith_shift;
7959                 }
7960               else
7961                 /* LSL (immediate).  */
7962                 cost += extra_cost->alu.shift;
7963
7964             }
7965           /* Strip extends as we will have costed them in the case above.  */
7966           if (is_extend)
7967             op0 = aarch64_strip_extend (op0, true);
7968
7969           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7970
7971           return cost;
7972         }
7973
7974       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
7975          compound and let the below cases handle it.  After all, MNEG is a
7976          special-case alias of MSUB.  */
7977       if (GET_CODE (op0) == NEG)
7978         {
7979           op0 = XEXP (op0, 0);
7980           compound_p = true;
7981         }
7982
7983       /* Integer multiplies or FMAs have zero/sign extending variants.  */
7984       if ((GET_CODE (op0) == ZERO_EXTEND
7985            && GET_CODE (op1) == ZERO_EXTEND)
7986           || (GET_CODE (op0) == SIGN_EXTEND
7987               && GET_CODE (op1) == SIGN_EXTEND))
7988         {
7989           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7990           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7991
7992           if (speed)
7993             {
7994               if (compound_p)
7995                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
7996                 cost += extra_cost->mult[0].extend_add;
7997               else
7998                 /* MUL/SMULL/UMULL.  */
7999                 cost += extra_cost->mult[0].extend;
8000             }
8001
8002           return cost;
8003         }
8004
8005       /* This is either an integer multiply or a MADD.  In both cases
8006          we want to recurse and cost the operands.  */
8007       cost += rtx_cost (op0, mode, MULT, 0, speed);
8008       cost += rtx_cost (op1, mode, MULT, 1, speed);
8009
8010       if (speed)
8011         {
8012           if (compound_p)
8013             /* MADD/MSUB.  */
8014             cost += extra_cost->mult[mode == DImode].add;
8015           else
8016             /* MUL.  */
8017             cost += extra_cost->mult[mode == DImode].simple;
8018         }
8019
8020       return cost;
8021     }
8022   else
8023     {
8024       if (speed)
8025         {
8026           /* Floating-point FMA/FMUL can also support negations of the
8027              operands, unless the rounding mode is upward or downward in
8028              which case FNMUL is different than FMUL with operand negation.  */
8029           bool neg0 = GET_CODE (op0) == NEG;
8030           bool neg1 = GET_CODE (op1) == NEG;
8031           if (compound_p || !flag_rounding_math || (neg0 && neg1))
8032             {
8033               if (neg0)
8034                 op0 = XEXP (op0, 0);
8035               if (neg1)
8036                 op1 = XEXP (op1, 0);
8037             }
8038
8039           if (compound_p)
8040             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
8041             cost += extra_cost->fp[mode == DFmode].fma;
8042           else
8043             /* FMUL/FNMUL.  */
8044             cost += extra_cost->fp[mode == DFmode].mult;
8045         }
8046
8047       cost += rtx_cost (op0, mode, MULT, 0, speed);
8048       cost += rtx_cost (op1, mode, MULT, 1, speed);
8049       return cost;
8050     }
8051 }
8052
8053 static int
8054 aarch64_address_cost (rtx x,
8055                       machine_mode mode,
8056                       addr_space_t as ATTRIBUTE_UNUSED,
8057                       bool speed)
8058 {
8059   enum rtx_code c = GET_CODE (x);
8060   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8061   struct aarch64_address_info info;
8062   int cost = 0;
8063   info.shift = 0;
8064
8065   if (!aarch64_classify_address (&info, x, mode, false))
8066     {
8067       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8068         {
8069           /* This is a CONST or SYMBOL ref which will be split
8070              in a different way depending on the code model in use.
8071              Cost it through the generic infrastructure.  */
8072           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8073           /* Divide through by the cost of one instruction to
8074              bring it to the same units as the address costs.  */
8075           cost_symbol_ref /= COSTS_N_INSNS (1);
8076           /* The cost is then the cost of preparing the address,
8077              followed by an immediate (possibly 0) offset.  */
8078           return cost_symbol_ref + addr_cost->imm_offset;
8079         }
8080       else
8081         {
8082           /* This is most likely a jump table from a case
8083              statement.  */
8084           return addr_cost->register_offset;
8085         }
8086     }
8087
8088   switch (info.type)
8089     {
8090       case ADDRESS_LO_SUM:
8091       case ADDRESS_SYMBOLIC:
8092       case ADDRESS_REG_IMM:
8093         cost += addr_cost->imm_offset;
8094         break;
8095
8096       case ADDRESS_REG_WB:
8097         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8098           cost += addr_cost->pre_modify;
8099         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8100           cost += addr_cost->post_modify;
8101         else
8102           gcc_unreachable ();
8103
8104         break;
8105
8106       case ADDRESS_REG_REG:
8107         cost += addr_cost->register_offset;
8108         break;
8109
8110       case ADDRESS_REG_SXTW:
8111         cost += addr_cost->register_sextend;
8112         break;
8113
8114       case ADDRESS_REG_UXTW:
8115         cost += addr_cost->register_zextend;
8116         break;
8117
8118       default:
8119         gcc_unreachable ();
8120     }
8121
8122
8123   if (info.shift > 0)
8124     {
8125       /* For the sake of calculating the cost of the shifted register
8126          component, we can treat same sized modes in the same way.  */
8127       if (known_eq (GET_MODE_BITSIZE (mode), 16))
8128         cost += addr_cost->addr_scale_costs.hi;
8129       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8130         cost += addr_cost->addr_scale_costs.si;
8131       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8132         cost += addr_cost->addr_scale_costs.di;
8133       else
8134         /* We can't tell, or this is a 128-bit vector.  */
8135         cost += addr_cost->addr_scale_costs.ti;
8136     }
8137
8138   return cost;
8139 }
8140
8141 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
8142    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
8143    to be taken.  */
8144
8145 int
8146 aarch64_branch_cost (bool speed_p, bool predictable_p)
8147 {
8148   /* When optimizing for speed, use the cost of unpredictable branches.  */
8149   const struct cpu_branch_cost *branch_costs =
8150     aarch64_tune_params.branch_costs;
8151
8152   if (!speed_p || predictable_p)
8153     return branch_costs->predictable;
8154   else
8155     return branch_costs->unpredictable;
8156 }
8157
8158 /* Return true if the RTX X in mode MODE is a zero or sign extract
8159    usable in an ADD or SUB (extended register) instruction.  */
8160 static bool
8161 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8162 {
8163   /* Catch add with a sign extract.
8164      This is add_<optab><mode>_multp2.  */
8165   if (GET_CODE (x) == SIGN_EXTRACT
8166       || GET_CODE (x) == ZERO_EXTRACT)
8167     {
8168       rtx op0 = XEXP (x, 0);
8169       rtx op1 = XEXP (x, 1);
8170       rtx op2 = XEXP (x, 2);
8171
8172       if (GET_CODE (op0) == MULT
8173           && CONST_INT_P (op1)
8174           && op2 == const0_rtx
8175           && CONST_INT_P (XEXP (op0, 1))
8176           && aarch64_is_extend_from_extract (mode,
8177                                              XEXP (op0, 1),
8178                                              op1))
8179         {
8180           return true;
8181         }
8182     }
8183   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8184      No shift.  */
8185   else if (GET_CODE (x) == SIGN_EXTEND
8186            || GET_CODE (x) == ZERO_EXTEND)
8187     return REG_P (XEXP (x, 0));
8188
8189   return false;
8190 }
8191
8192 static bool
8193 aarch64_frint_unspec_p (unsigned int u)
8194 {
8195   switch (u)
8196     {
8197       case UNSPEC_FRINTZ:
8198       case UNSPEC_FRINTP:
8199       case UNSPEC_FRINTM:
8200       case UNSPEC_FRINTA:
8201       case UNSPEC_FRINTN:
8202       case UNSPEC_FRINTX:
8203       case UNSPEC_FRINTI:
8204         return true;
8205
8206       default:
8207         return false;
8208     }
8209 }
8210
8211 /* Return true iff X is an rtx that will match an extr instruction
8212    i.e. as described in the *extr<mode>5_insn family of patterns.
8213    OP0 and OP1 will be set to the operands of the shifts involved
8214    on success and will be NULL_RTX otherwise.  */
8215
8216 static bool
8217 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8218 {
8219   rtx op0, op1;
8220   scalar_int_mode mode;
8221   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8222     return false;
8223
8224   *res_op0 = NULL_RTX;
8225   *res_op1 = NULL_RTX;
8226
8227   if (GET_CODE (x) != IOR)
8228     return false;
8229
8230   op0 = XEXP (x, 0);
8231   op1 = XEXP (x, 1);
8232
8233   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8234       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8235     {
8236      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
8237       if (GET_CODE (op1) == ASHIFT)
8238         std::swap (op0, op1);
8239
8240       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8241         return false;
8242
8243       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8244       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8245
8246       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8247           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8248         {
8249           *res_op0 = XEXP (op0, 0);
8250           *res_op1 = XEXP (op1, 0);
8251           return true;
8252         }
8253     }
8254
8255   return false;
8256 }
8257
8258 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8259    storing it in *COST.  Result is true if the total cost of the operation
8260    has now been calculated.  */
8261 static bool
8262 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8263 {
8264   rtx inner;
8265   rtx comparator;
8266   enum rtx_code cmpcode;
8267
8268   if (COMPARISON_P (op0))
8269     {
8270       inner = XEXP (op0, 0);
8271       comparator = XEXP (op0, 1);
8272       cmpcode = GET_CODE (op0);
8273     }
8274   else
8275     {
8276       inner = op0;
8277       comparator = const0_rtx;
8278       cmpcode = NE;
8279     }
8280
8281   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8282     {
8283       /* Conditional branch.  */
8284       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8285         return true;
8286       else
8287         {
8288           if (cmpcode == NE || cmpcode == EQ)
8289             {
8290               if (comparator == const0_rtx)
8291                 {
8292                   /* TBZ/TBNZ/CBZ/CBNZ.  */
8293                   if (GET_CODE (inner) == ZERO_EXTRACT)
8294                     /* TBZ/TBNZ.  */
8295                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8296                                        ZERO_EXTRACT, 0, speed);
8297                   else
8298                     /* CBZ/CBNZ.  */
8299                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8300
8301                 return true;
8302               }
8303             }
8304           else if (cmpcode == LT || cmpcode == GE)
8305             {
8306               /* TBZ/TBNZ.  */
8307               if (comparator == const0_rtx)
8308                 return true;
8309             }
8310         }
8311     }
8312   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8313     {
8314       /* CCMP.  */
8315       if (GET_CODE (op1) == COMPARE)
8316         {
8317           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
8318           if (XEXP (op1, 1) == const0_rtx)
8319             *cost += 1;
8320           if (speed)
8321             {
8322               machine_mode mode = GET_MODE (XEXP (op1, 0));
8323               const struct cpu_cost_table *extra_cost
8324                 = aarch64_tune_params.insn_extra_cost;
8325
8326               if (GET_MODE_CLASS (mode) == MODE_INT)
8327                 *cost += extra_cost->alu.arith;
8328               else
8329                 *cost += extra_cost->fp[mode == DFmode].compare;
8330             }
8331           return true;
8332         }
8333
8334       /* It's a conditional operation based on the status flags,
8335          so it must be some flavor of CSEL.  */
8336
8337       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
8338       if (GET_CODE (op1) == NEG
8339           || GET_CODE (op1) == NOT
8340           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8341         op1 = XEXP (op1, 0);
8342       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8343         {
8344           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
8345           op1 = XEXP (op1, 0);
8346           op2 = XEXP (op2, 0);
8347         }
8348
8349       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8350       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8351       return true;
8352     }
8353
8354   /* We don't know what this is, cost all operands.  */
8355   return false;
8356 }
8357
8358 /* Check whether X is a bitfield operation of the form shift + extend that
8359    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
8360    operand to which the bitfield operation is applied.  Otherwise return
8361    NULL_RTX.  */
8362
8363 static rtx
8364 aarch64_extend_bitfield_pattern_p (rtx x)
8365 {
8366   rtx_code outer_code = GET_CODE (x);
8367   machine_mode outer_mode = GET_MODE (x);
8368
8369   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8370       && outer_mode != SImode && outer_mode != DImode)
8371     return NULL_RTX;
8372
8373   rtx inner = XEXP (x, 0);
8374   rtx_code inner_code = GET_CODE (inner);
8375   machine_mode inner_mode = GET_MODE (inner);
8376   rtx op = NULL_RTX;
8377
8378   switch (inner_code)
8379     {
8380       case ASHIFT:
8381         if (CONST_INT_P (XEXP (inner, 1))
8382             && (inner_mode == QImode || inner_mode == HImode))
8383           op = XEXP (inner, 0);
8384         break;
8385       case LSHIFTRT:
8386         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8387             && (inner_mode == QImode || inner_mode == HImode))
8388           op = XEXP (inner, 0);
8389         break;
8390       case ASHIFTRT:
8391         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8392             && (inner_mode == QImode || inner_mode == HImode))
8393           op = XEXP (inner, 0);
8394         break;
8395       default:
8396         break;
8397     }
8398
8399   return op;
8400 }
8401
8402 /* Return true if the mask and a shift amount from an RTX of the form
8403    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8404    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
8405
8406 bool
8407 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8408                                     rtx shft_amnt)
8409 {
8410   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8411          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8412          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8413          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8414 }
8415
8416 /* Calculate the cost of calculating X, storing it in *COST.  Result
8417    is true if the total cost of the operation has now been calculated.  */
8418 static bool
8419 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8420                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8421 {
8422   rtx op0, op1, op2;
8423   const struct cpu_cost_table *extra_cost
8424     = aarch64_tune_params.insn_extra_cost;
8425   int code = GET_CODE (x);
8426   scalar_int_mode int_mode;
8427
8428   /* By default, assume that everything has equivalent cost to the
8429      cheapest instruction.  Any additional costs are applied as a delta
8430      above this default.  */
8431   *cost = COSTS_N_INSNS (1);
8432
8433   switch (code)
8434     {
8435     case SET:
8436       /* The cost depends entirely on the operands to SET.  */
8437       *cost = 0;
8438       op0 = SET_DEST (x);
8439       op1 = SET_SRC (x);
8440
8441       switch (GET_CODE (op0))
8442         {
8443         case MEM:
8444           if (speed)
8445             {
8446               rtx address = XEXP (op0, 0);
8447               if (VECTOR_MODE_P (mode))
8448                 *cost += extra_cost->ldst.storev;
8449               else if (GET_MODE_CLASS (mode) == MODE_INT)
8450                 *cost += extra_cost->ldst.store;
8451               else if (mode == SFmode)
8452                 *cost += extra_cost->ldst.storef;
8453               else if (mode == DFmode)
8454                 *cost += extra_cost->ldst.stored;
8455
8456               *cost +=
8457                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8458                                                      0, speed));
8459             }
8460
8461           *cost += rtx_cost (op1, mode, SET, 1, speed);
8462           return true;
8463
8464         case SUBREG:
8465           if (! REG_P (SUBREG_REG (op0)))
8466             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8467
8468           /* Fall through.  */
8469         case REG:
8470           /* The cost is one per vector-register copied.  */
8471           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8472             {
8473               int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8474               *cost = COSTS_N_INSNS (nregs);
8475             }
8476           /* const0_rtx is in general free, but we will use an
8477              instruction to set a register to 0.  */
8478           else if (REG_P (op1) || op1 == const0_rtx)
8479             {
8480               /* The cost is 1 per register copied.  */
8481               int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8482               *cost = COSTS_N_INSNS (nregs);
8483             }
8484           else
8485             /* Cost is just the cost of the RHS of the set.  */
8486             *cost += rtx_cost (op1, mode, SET, 1, speed);
8487           return true;
8488
8489         case ZERO_EXTRACT:
8490         case SIGN_EXTRACT:
8491           /* Bit-field insertion.  Strip any redundant widening of
8492              the RHS to meet the width of the target.  */
8493           if (GET_CODE (op1) == SUBREG)
8494             op1 = SUBREG_REG (op1);
8495           if ((GET_CODE (op1) == ZERO_EXTEND
8496                || GET_CODE (op1) == SIGN_EXTEND)
8497               && CONST_INT_P (XEXP (op0, 1))
8498               && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8499               && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8500             op1 = XEXP (op1, 0);
8501
8502           if (CONST_INT_P (op1))
8503             {
8504               /* MOV immediate is assumed to always be cheap.  */
8505               *cost = COSTS_N_INSNS (1);
8506             }
8507           else
8508             {
8509               /* BFM.  */
8510               if (speed)
8511                 *cost += extra_cost->alu.bfi;
8512               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8513             }
8514
8515           return true;
8516
8517         default:
8518           /* We can't make sense of this, assume default cost.  */
8519           *cost = COSTS_N_INSNS (1);
8520           return false;
8521         }
8522       return false;
8523
8524     case CONST_INT:
8525       /* If an instruction can incorporate a constant within the
8526          instruction, the instruction's expression avoids calling
8527          rtx_cost() on the constant.  If rtx_cost() is called on a
8528          constant, then it is usually because the constant must be
8529          moved into a register by one or more instructions.
8530
8531          The exception is constant 0, which can be expressed
8532          as XZR/WZR and is therefore free.  The exception to this is
8533          if we have (set (reg) (const0_rtx)) in which case we must cost
8534          the move.  However, we can catch that when we cost the SET, so
8535          we don't need to consider that here.  */
8536       if (x == const0_rtx)
8537         *cost = 0;
8538       else
8539         {
8540           /* To an approximation, building any other constant is
8541              proportionally expensive to the number of instructions
8542              required to build that constant.  This is true whether we
8543              are compiling for SPEED or otherwise.  */
8544           if (!is_a <scalar_int_mode> (mode, &int_mode))
8545             int_mode = word_mode;
8546           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8547                                  (NULL_RTX, x, false, int_mode));
8548         }
8549       return true;
8550
8551     case CONST_DOUBLE:
8552
8553       /* First determine number of instructions to do the move
8554           as an integer constant.  */
8555       if (!aarch64_float_const_representable_p (x)
8556            && !aarch64_can_const_movi_rtx_p (x, mode)
8557            && aarch64_float_const_rtx_p (x))
8558         {
8559           unsigned HOST_WIDE_INT ival;
8560           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8561           gcc_assert (succeed);
8562
8563           scalar_int_mode imode = (mode == HFmode
8564                                    ? SImode
8565                                    : int_mode_for_mode (mode).require ());
8566           int ncost = aarch64_internal_mov_immediate
8567                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8568           *cost += COSTS_N_INSNS (ncost);
8569           return true;
8570         }
8571
8572       if (speed)
8573         {
8574           /* mov[df,sf]_aarch64.  */
8575           if (aarch64_float_const_representable_p (x))
8576             /* FMOV (scalar immediate).  */
8577             *cost += extra_cost->fp[mode == DFmode].fpconst;
8578           else if (!aarch64_float_const_zero_rtx_p (x))
8579             {
8580               /* This will be a load from memory.  */
8581               if (mode == DFmode)
8582                 *cost += extra_cost->ldst.loadd;
8583               else
8584                 *cost += extra_cost->ldst.loadf;
8585             }
8586           else
8587             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
8588                or MOV v0.s[0], wzr - neither of which are modeled by the
8589                cost tables.  Just use the default cost.  */
8590             {
8591             }
8592         }
8593
8594       return true;
8595
8596     case MEM:
8597       if (speed)
8598         {
8599           /* For loads we want the base cost of a load, plus an
8600              approximation for the additional cost of the addressing
8601              mode.  */
8602           rtx address = XEXP (x, 0);
8603           if (VECTOR_MODE_P (mode))
8604             *cost += extra_cost->ldst.loadv;
8605           else if (GET_MODE_CLASS (mode) == MODE_INT)
8606             *cost += extra_cost->ldst.load;
8607           else if (mode == SFmode)
8608             *cost += extra_cost->ldst.loadf;
8609           else if (mode == DFmode)
8610             *cost += extra_cost->ldst.loadd;
8611
8612           *cost +=
8613                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8614                                                      0, speed));
8615         }
8616
8617       return true;
8618
8619     case NEG:
8620       op0 = XEXP (x, 0);
8621
8622       if (VECTOR_MODE_P (mode))
8623         {
8624           if (speed)
8625             {
8626               /* FNEG.  */
8627               *cost += extra_cost->vect.alu;
8628             }
8629           return false;
8630         }
8631
8632       if (GET_MODE_CLASS (mode) == MODE_INT)
8633         {
8634           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8635               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8636             {
8637               /* CSETM.  */
8638               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8639               return true;
8640             }
8641
8642           /* Cost this as SUB wzr, X.  */
8643           op0 = CONST0_RTX (mode);
8644           op1 = XEXP (x, 0);
8645           goto cost_minus;
8646         }
8647
8648       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8649         {
8650           /* Support (neg(fma...)) as a single instruction only if
8651              sign of zeros is unimportant.  This matches the decision
8652              making in aarch64.md.  */
8653           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8654             {
8655               /* FNMADD.  */
8656               *cost = rtx_cost (op0, mode, NEG, 0, speed);
8657               return true;
8658             }
8659           if (GET_CODE (op0) == MULT)
8660             {
8661               /* FNMUL.  */
8662               *cost = rtx_cost (op0, mode, NEG, 0, speed);
8663               return true;
8664             }
8665           if (speed)
8666             /* FNEG.  */
8667             *cost += extra_cost->fp[mode == DFmode].neg;
8668           return false;
8669         }
8670
8671       return false;
8672
8673     case CLRSB:
8674     case CLZ:
8675       if (speed)
8676         {
8677           if (VECTOR_MODE_P (mode))
8678             *cost += extra_cost->vect.alu;
8679           else
8680             *cost += extra_cost->alu.clz;
8681         }
8682
8683       return false;
8684
8685     case COMPARE:
8686       op0 = XEXP (x, 0);
8687       op1 = XEXP (x, 1);
8688
8689       if (op1 == const0_rtx
8690           && GET_CODE (op0) == AND)
8691         {
8692           x = op0;
8693           mode = GET_MODE (op0);
8694           goto cost_logic;
8695         }
8696
8697       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8698         {
8699           /* TODO: A write to the CC flags possibly costs extra, this
8700              needs encoding in the cost tables.  */
8701
8702           mode = GET_MODE (op0);
8703           /* ANDS.  */
8704           if (GET_CODE (op0) == AND)
8705             {
8706               x = op0;
8707               goto cost_logic;
8708             }
8709
8710           if (GET_CODE (op0) == PLUS)
8711             {
8712               /* ADDS (and CMN alias).  */
8713               x = op0;
8714               goto cost_plus;
8715             }
8716
8717           if (GET_CODE (op0) == MINUS)
8718             {
8719               /* SUBS.  */
8720               x = op0;
8721               goto cost_minus;
8722             }
8723
8724           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8725               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8726               && CONST_INT_P (XEXP (op0, 2)))
8727             {
8728               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8729                  Handle it here directly rather than going to cost_logic
8730                  since we know the immediate generated for the TST is valid
8731                  so we can avoid creating an intermediate rtx for it only
8732                  for costing purposes.  */
8733               if (speed)
8734                 *cost += extra_cost->alu.logical;
8735
8736               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8737                                  ZERO_EXTRACT, 0, speed);
8738               return true;
8739             }
8740
8741           if (GET_CODE (op1) == NEG)
8742             {
8743               /* CMN.  */
8744               if (speed)
8745                 *cost += extra_cost->alu.arith;
8746
8747               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8748               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8749               return true;
8750             }
8751
8752           /* CMP.
8753
8754              Compare can freely swap the order of operands, and
8755              canonicalization puts the more complex operation first.
8756              But the integer MINUS logic expects the shift/extend
8757              operation in op1.  */
8758           if (! (REG_P (op0)
8759                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8760           {
8761             op0 = XEXP (x, 1);
8762             op1 = XEXP (x, 0);
8763           }
8764           goto cost_minus;
8765         }
8766
8767       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8768         {
8769           /* FCMP.  */
8770           if (speed)
8771             *cost += extra_cost->fp[mode == DFmode].compare;
8772
8773           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8774             {
8775               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8776               /* FCMP supports constant 0.0 for no extra cost. */
8777               return true;
8778             }
8779           return false;
8780         }
8781
8782       if (VECTOR_MODE_P (mode))
8783         {
8784           /* Vector compare.  */
8785           if (speed)
8786             *cost += extra_cost->vect.alu;
8787
8788           if (aarch64_float_const_zero_rtx_p (op1))
8789             {
8790               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8791                  cost.  */
8792               return true;
8793             }
8794           return false;
8795         }
8796       return false;
8797
8798     case MINUS:
8799       {
8800         op0 = XEXP (x, 0);
8801         op1 = XEXP (x, 1);
8802
8803 cost_minus:
8804         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8805
8806         /* Detect valid immediates.  */
8807         if ((GET_MODE_CLASS (mode) == MODE_INT
8808              || (GET_MODE_CLASS (mode) == MODE_CC
8809                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8810             && CONST_INT_P (op1)
8811             && aarch64_uimm12_shift (INTVAL (op1)))
8812           {
8813             if (speed)
8814               /* SUB(S) (immediate).  */
8815               *cost += extra_cost->alu.arith;
8816             return true;
8817           }
8818
8819         /* Look for SUB (extended register).  */
8820         if (is_a <scalar_int_mode> (mode, &int_mode)
8821             && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8822           {
8823             if (speed)
8824               *cost += extra_cost->alu.extend_arith;
8825
8826             op1 = aarch64_strip_extend (op1, true);
8827             *cost += rtx_cost (op1, VOIDmode,
8828                                (enum rtx_code) GET_CODE (op1), 0, speed);
8829             return true;
8830           }
8831
8832         rtx new_op1 = aarch64_strip_extend (op1, false);
8833
8834         /* Cost this as an FMA-alike operation.  */
8835         if ((GET_CODE (new_op1) == MULT
8836              || aarch64_shift_p (GET_CODE (new_op1)))
8837             && code != COMPARE)
8838           {
8839             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8840                                             (enum rtx_code) code,
8841                                             speed);
8842             return true;
8843           }
8844
8845         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8846
8847         if (speed)
8848           {
8849             if (VECTOR_MODE_P (mode))
8850               {
8851                 /* Vector SUB.  */
8852                 *cost += extra_cost->vect.alu;
8853               }
8854             else if (GET_MODE_CLASS (mode) == MODE_INT)
8855               {
8856                 /* SUB(S).  */
8857                 *cost += extra_cost->alu.arith;
8858               }
8859             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8860               {
8861                 /* FSUB.  */
8862                 *cost += extra_cost->fp[mode == DFmode].addsub;
8863               }
8864           }
8865         return true;
8866       }
8867
8868     case PLUS:
8869       {
8870         rtx new_op0;
8871
8872         op0 = XEXP (x, 0);
8873         op1 = XEXP (x, 1);
8874
8875 cost_plus:
8876         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8877             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8878           {
8879             /* CSINC.  */
8880             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8881             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8882             return true;
8883           }
8884
8885         if (GET_MODE_CLASS (mode) == MODE_INT
8886             && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8887                 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8888           {
8889             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8890
8891             if (speed)
8892               /* ADD (immediate).  */
8893               *cost += extra_cost->alu.arith;
8894             return true;
8895           }
8896
8897         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8898
8899         /* Look for ADD (extended register).  */
8900         if (is_a <scalar_int_mode> (mode, &int_mode)
8901             && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8902           {
8903             if (speed)
8904               *cost += extra_cost->alu.extend_arith;
8905
8906             op0 = aarch64_strip_extend (op0, true);
8907             *cost += rtx_cost (op0, VOIDmode,
8908                                (enum rtx_code) GET_CODE (op0), 0, speed);
8909             return true;
8910           }
8911
8912         /* Strip any extend, leave shifts behind as we will
8913            cost them through mult_cost.  */
8914         new_op0 = aarch64_strip_extend (op0, false);
8915
8916         if (GET_CODE (new_op0) == MULT
8917             || aarch64_shift_p (GET_CODE (new_op0)))
8918           {
8919             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8920                                             speed);
8921             return true;
8922           }
8923
8924         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8925
8926         if (speed)
8927           {
8928             if (VECTOR_MODE_P (mode))
8929               {
8930                 /* Vector ADD.  */
8931                 *cost += extra_cost->vect.alu;
8932               }
8933             else if (GET_MODE_CLASS (mode) == MODE_INT)
8934               {
8935                 /* ADD.  */
8936                 *cost += extra_cost->alu.arith;
8937               }
8938             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8939               {
8940                 /* FADD.  */
8941                 *cost += extra_cost->fp[mode == DFmode].addsub;
8942               }
8943           }
8944         return true;
8945       }
8946
8947     case BSWAP:
8948       *cost = COSTS_N_INSNS (1);
8949
8950       if (speed)
8951         {
8952           if (VECTOR_MODE_P (mode))
8953             *cost += extra_cost->vect.alu;
8954           else
8955             *cost += extra_cost->alu.rev;
8956         }
8957       return false;
8958
8959     case IOR:
8960       if (aarch_rev16_p (x))
8961         {
8962           *cost = COSTS_N_INSNS (1);
8963
8964           if (speed)
8965             {
8966               if (VECTOR_MODE_P (mode))
8967                 *cost += extra_cost->vect.alu;
8968               else
8969                 *cost += extra_cost->alu.rev;
8970             }
8971           return true;
8972         }
8973
8974       if (aarch64_extr_rtx_p (x, &op0, &op1))
8975         {
8976           *cost += rtx_cost (op0, mode, IOR, 0, speed);
8977           *cost += rtx_cost (op1, mode, IOR, 1, speed);
8978           if (speed)
8979             *cost += extra_cost->alu.shift;
8980
8981           return true;
8982         }
8983     /* Fall through.  */
8984     case XOR:
8985     case AND:
8986     cost_logic:
8987       op0 = XEXP (x, 0);
8988       op1 = XEXP (x, 1);
8989
8990       if (VECTOR_MODE_P (mode))
8991         {
8992           if (speed)
8993             *cost += extra_cost->vect.alu;
8994           return true;
8995         }
8996
8997       if (code == AND
8998           && GET_CODE (op0) == MULT
8999           && CONST_INT_P (XEXP (op0, 1))
9000           && CONST_INT_P (op1)
9001           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9002                                INTVAL (op1)) != 0)
9003         {
9004           /* This is a UBFM/SBFM.  */
9005           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9006           if (speed)
9007             *cost += extra_cost->alu.bfx;
9008           return true;
9009         }
9010
9011       if (is_int_mode (mode, &int_mode))
9012         {
9013           if (CONST_INT_P (op1))
9014             {
9015               /* We have a mask + shift version of a UBFIZ
9016                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
9017               if (GET_CODE (op0) == ASHIFT
9018                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9019                                                          XEXP (op0, 1)))
9020                 {
9021                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
9022                                      (enum rtx_code) code, 0, speed);
9023                   if (speed)
9024                     *cost += extra_cost->alu.bfx;
9025
9026                   return true;
9027                 }
9028               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9029                 {
9030                 /* We possibly get the immediate for free, this is not
9031                    modelled.  */
9032                   *cost += rtx_cost (op0, int_mode,
9033                                      (enum rtx_code) code, 0, speed);
9034                   if (speed)
9035                     *cost += extra_cost->alu.logical;
9036
9037                   return true;
9038                 }
9039             }
9040           else
9041             {
9042               rtx new_op0 = op0;
9043
9044               /* Handle ORN, EON, or BIC.  */
9045               if (GET_CODE (op0) == NOT)
9046                 op0 = XEXP (op0, 0);
9047
9048               new_op0 = aarch64_strip_shift (op0);
9049
9050               /* If we had a shift on op0 then this is a logical-shift-
9051                  by-register/immediate operation.  Otherwise, this is just
9052                  a logical operation.  */
9053               if (speed)
9054                 {
9055                   if (new_op0 != op0)
9056                     {
9057                       /* Shift by immediate.  */
9058                       if (CONST_INT_P (XEXP (op0, 1)))
9059                         *cost += extra_cost->alu.log_shift;
9060                       else
9061                         *cost += extra_cost->alu.log_shift_reg;
9062                     }
9063                   else
9064                     *cost += extra_cost->alu.logical;
9065                 }
9066
9067               /* In both cases we want to cost both operands.  */
9068               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9069                                  0, speed);
9070               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9071                                  1, speed);
9072
9073               return true;
9074             }
9075         }
9076       return false;
9077
9078     case NOT:
9079       x = XEXP (x, 0);
9080       op0 = aarch64_strip_shift (x);
9081
9082       if (VECTOR_MODE_P (mode))
9083         {
9084           /* Vector NOT.  */
9085           *cost += extra_cost->vect.alu;
9086           return false;
9087         }
9088
9089       /* MVN-shifted-reg.  */
9090       if (op0 != x)
9091         {
9092           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9093
9094           if (speed)
9095             *cost += extra_cost->alu.log_shift;
9096
9097           return true;
9098         }
9099       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9100          Handle the second form here taking care that 'a' in the above can
9101          be a shift.  */
9102       else if (GET_CODE (op0) == XOR)
9103         {
9104           rtx newop0 = XEXP (op0, 0);
9105           rtx newop1 = XEXP (op0, 1);
9106           rtx op0_stripped = aarch64_strip_shift (newop0);
9107
9108           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9109           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9110
9111           if (speed)
9112             {
9113               if (op0_stripped != newop0)
9114                 *cost += extra_cost->alu.log_shift;
9115               else
9116                 *cost += extra_cost->alu.logical;
9117             }
9118
9119           return true;
9120         }
9121       /* MVN.  */
9122       if (speed)
9123         *cost += extra_cost->alu.logical;
9124
9125       return false;
9126
9127     case ZERO_EXTEND:
9128
9129       op0 = XEXP (x, 0);
9130       /* If a value is written in SI mode, then zero extended to DI
9131          mode, the operation will in general be free as a write to
9132          a 'w' register implicitly zeroes the upper bits of an 'x'
9133          register.  However, if this is
9134
9135            (set (reg) (zero_extend (reg)))
9136
9137          we must cost the explicit register move.  */
9138       if (mode == DImode
9139           && GET_MODE (op0) == SImode
9140           && outer == SET)
9141         {
9142           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9143
9144         /* If OP_COST is non-zero, then the cost of the zero extend
9145            is effectively the cost of the inner operation.  Otherwise
9146            we have a MOV instruction and we take the cost from the MOV
9147            itself.  This is true independently of whether we are
9148            optimizing for space or time.  */
9149           if (op_cost)
9150             *cost = op_cost;
9151
9152           return true;
9153         }
9154       else if (MEM_P (op0))
9155         {
9156           /* All loads can zero extend to any size for free.  */
9157           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9158           return true;
9159         }
9160
9161       op0 = aarch64_extend_bitfield_pattern_p (x);
9162       if (op0)
9163         {
9164           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9165           if (speed)
9166             *cost += extra_cost->alu.bfx;
9167           return true;
9168         }
9169
9170       if (speed)
9171         {
9172           if (VECTOR_MODE_P (mode))
9173             {
9174               /* UMOV.  */
9175               *cost += extra_cost->vect.alu;
9176             }
9177           else
9178             {
9179               /* We generate an AND instead of UXTB/UXTH.  */
9180               *cost += extra_cost->alu.logical;
9181             }
9182         }
9183       return false;
9184
9185     case SIGN_EXTEND:
9186       if (MEM_P (XEXP (x, 0)))
9187         {
9188           /* LDRSH.  */
9189           if (speed)
9190             {
9191               rtx address = XEXP (XEXP (x, 0), 0);
9192               *cost += extra_cost->ldst.load_sign_extend;
9193
9194               *cost +=
9195                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9196                                                      0, speed));
9197             }
9198           return true;
9199         }
9200
9201       op0 = aarch64_extend_bitfield_pattern_p (x);
9202       if (op0)
9203         {
9204           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9205           if (speed)
9206             *cost += extra_cost->alu.bfx;
9207           return true;
9208         }
9209
9210       if (speed)
9211         {
9212           if (VECTOR_MODE_P (mode))
9213             *cost += extra_cost->vect.alu;
9214           else
9215             *cost += extra_cost->alu.extend;
9216         }
9217       return false;
9218
9219     case ASHIFT:
9220       op0 = XEXP (x, 0);
9221       op1 = XEXP (x, 1);
9222
9223       if (CONST_INT_P (op1))
9224         {
9225           if (speed)
9226             {
9227               if (VECTOR_MODE_P (mode))
9228                 {
9229                   /* Vector shift (immediate).  */
9230                   *cost += extra_cost->vect.alu;
9231                 }
9232               else
9233                 {
9234                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
9235                      aliases.  */
9236                   *cost += extra_cost->alu.shift;
9237                 }
9238             }
9239
9240           /* We can incorporate zero/sign extend for free.  */
9241           if (GET_CODE (op0) == ZERO_EXTEND
9242               || GET_CODE (op0) == SIGN_EXTEND)
9243             op0 = XEXP (op0, 0);
9244
9245           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9246           return true;
9247         }
9248       else
9249         {
9250           if (VECTOR_MODE_P (mode))
9251             {
9252               if (speed)
9253                 /* Vector shift (register).  */
9254                 *cost += extra_cost->vect.alu;
9255             }
9256           else
9257             {
9258               if (speed)
9259                 /* LSLV.  */
9260                 *cost += extra_cost->alu.shift_reg;
9261
9262               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9263                   && CONST_INT_P (XEXP (op1, 1))
9264                   && known_eq (INTVAL (XEXP (op1, 1)),
9265                                GET_MODE_BITSIZE (mode) - 1))
9266                 {
9267                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9268                   /* We already demanded XEXP (op1, 0) to be REG_P, so
9269                      don't recurse into it.  */
9270                   return true;
9271                 }
9272             }
9273           return false;  /* All arguments need to be in registers.  */
9274         }
9275
9276     case ROTATE:
9277     case ROTATERT:
9278     case LSHIFTRT:
9279     case ASHIFTRT:
9280       op0 = XEXP (x, 0);
9281       op1 = XEXP (x, 1);
9282
9283       if (CONST_INT_P (op1))
9284         {
9285           /* ASR (immediate) and friends.  */
9286           if (speed)
9287             {
9288               if (VECTOR_MODE_P (mode))
9289                 *cost += extra_cost->vect.alu;
9290               else
9291                 *cost += extra_cost->alu.shift;
9292             }
9293
9294           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9295           return true;
9296         }
9297       else
9298         {
9299           if (VECTOR_MODE_P (mode))
9300             {
9301               if (speed)
9302                 /* Vector shift (register).  */
9303                 *cost += extra_cost->vect.alu;
9304             }
9305           else
9306             {
9307               if (speed)
9308                 /* ASR (register) and friends.  */
9309                 *cost += extra_cost->alu.shift_reg;
9310
9311               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9312                   && CONST_INT_P (XEXP (op1, 1))
9313                   && known_eq (INTVAL (XEXP (op1, 1)),
9314                                GET_MODE_BITSIZE (mode) - 1))
9315                 {
9316                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9317                   /* We already demanded XEXP (op1, 0) to be REG_P, so
9318                      don't recurse into it.  */
9319                   return true;
9320                 }
9321             }
9322           return false;  /* All arguments need to be in registers.  */
9323         }
9324
9325     case SYMBOL_REF:
9326
9327       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9328           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9329         {
9330           /* LDR.  */
9331           if (speed)
9332             *cost += extra_cost->ldst.load;
9333         }
9334       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9335                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9336         {
9337           /* ADRP, followed by ADD.  */
9338           *cost += COSTS_N_INSNS (1);
9339           if (speed)
9340             *cost += 2 * extra_cost->alu.arith;
9341         }
9342       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9343                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9344         {
9345           /* ADR.  */
9346           if (speed)
9347             *cost += extra_cost->alu.arith;
9348         }
9349
9350       if (flag_pic)
9351         {
9352           /* One extra load instruction, after accessing the GOT.  */
9353           *cost += COSTS_N_INSNS (1);
9354           if (speed)
9355             *cost += extra_cost->ldst.load;
9356         }
9357       return true;
9358
9359     case HIGH:
9360     case LO_SUM:
9361       /* ADRP/ADD (immediate).  */
9362       if (speed)
9363         *cost += extra_cost->alu.arith;
9364       return true;
9365
9366     case ZERO_EXTRACT:
9367     case SIGN_EXTRACT:
9368       /* UBFX/SBFX.  */
9369       if (speed)
9370         {
9371           if (VECTOR_MODE_P (mode))
9372             *cost += extra_cost->vect.alu;
9373           else
9374             *cost += extra_cost->alu.bfx;
9375         }
9376
9377       /* We can trust that the immediates used will be correct (there
9378          are no by-register forms), so we need only cost op0.  */
9379       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9380       return true;
9381
9382     case MULT:
9383       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9384       /* aarch64_rtx_mult_cost always handles recursion to its
9385          operands.  */
9386       return true;
9387
9388     case MOD:
9389     /* We can expand signed mod by power of 2 using a NEGS, two parallel
9390        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
9391        an unconditional negate.  This case should only ever be reached through
9392        the set_smod_pow2_cheap check in expmed.c.  */
9393       if (CONST_INT_P (XEXP (x, 1))
9394           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9395           && (mode == SImode || mode == DImode))
9396         {
9397           /* We expand to 4 instructions.  Reset the baseline.  */
9398           *cost = COSTS_N_INSNS (4);
9399
9400           if (speed)
9401             *cost += 2 * extra_cost->alu.logical
9402                      + 2 * extra_cost->alu.arith;
9403
9404           return true;
9405         }
9406
9407     /* Fall-through.  */
9408     case UMOD:
9409       if (speed)
9410         {
9411           /* Slighly prefer UMOD over SMOD.  */
9412           if (VECTOR_MODE_P (mode))
9413             *cost += extra_cost->vect.alu;
9414           else if (GET_MODE_CLASS (mode) == MODE_INT)
9415             *cost += (extra_cost->mult[mode == DImode].add
9416                       + extra_cost->mult[mode == DImode].idiv
9417                       + (code == MOD ? 1 : 0));
9418         }
9419       return false;  /* All arguments need to be in registers.  */
9420
9421     case DIV:
9422     case UDIV:
9423     case SQRT:
9424       if (speed)
9425         {
9426           if (VECTOR_MODE_P (mode))
9427             *cost += extra_cost->vect.alu;
9428           else if (GET_MODE_CLASS (mode) == MODE_INT)
9429             /* There is no integer SQRT, so only DIV and UDIV can get
9430                here.  */
9431             *cost += (extra_cost->mult[mode == DImode].idiv
9432                      /* Slighly prefer UDIV over SDIV.  */
9433                      + (code == DIV ? 1 : 0));
9434           else
9435             *cost += extra_cost->fp[mode == DFmode].div;
9436         }
9437       return false;  /* All arguments need to be in registers.  */
9438
9439     case IF_THEN_ELSE:
9440       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9441                                          XEXP (x, 2), cost, speed);
9442
9443     case EQ:
9444     case NE:
9445     case GT:
9446     case GTU:
9447     case LT:
9448     case LTU:
9449     case GE:
9450     case GEU:
9451     case LE:
9452     case LEU:
9453
9454       return false; /* All arguments must be in registers.  */
9455
9456     case FMA:
9457       op0 = XEXP (x, 0);
9458       op1 = XEXP (x, 1);
9459       op2 = XEXP (x, 2);
9460
9461       if (speed)
9462         {
9463           if (VECTOR_MODE_P (mode))
9464             *cost += extra_cost->vect.alu;
9465           else
9466             *cost += extra_cost->fp[mode == DFmode].fma;
9467         }
9468
9469       /* FMSUB, FNMADD, and FNMSUB are free.  */
9470       if (GET_CODE (op0) == NEG)
9471         op0 = XEXP (op0, 0);
9472
9473       if (GET_CODE (op2) == NEG)
9474         op2 = XEXP (op2, 0);
9475
9476       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9477          and the by-element operand as operand 0.  */
9478       if (GET_CODE (op1) == NEG)
9479         op1 = XEXP (op1, 0);
9480
9481       /* Catch vector-by-element operations.  The by-element operand can
9482          either be (vec_duplicate (vec_select (x))) or just
9483          (vec_select (x)), depending on whether we are multiplying by
9484          a vector or a scalar.
9485
9486          Canonicalization is not very good in these cases, FMA4 will put the
9487          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
9488       if (GET_CODE (op0) == VEC_DUPLICATE)
9489         op0 = XEXP (op0, 0);
9490       else if (GET_CODE (op1) == VEC_DUPLICATE)
9491         op1 = XEXP (op1, 0);
9492
9493       if (GET_CODE (op0) == VEC_SELECT)
9494         op0 = XEXP (op0, 0);
9495       else if (GET_CODE (op1) == VEC_SELECT)
9496         op1 = XEXP (op1, 0);
9497
9498       /* If the remaining parameters are not registers,
9499          get the cost to put them into registers.  */
9500       *cost += rtx_cost (op0, mode, FMA, 0, speed);
9501       *cost += rtx_cost (op1, mode, FMA, 1, speed);
9502       *cost += rtx_cost (op2, mode, FMA, 2, speed);
9503       return true;
9504
9505     case FLOAT:
9506     case UNSIGNED_FLOAT:
9507       if (speed)
9508         *cost += extra_cost->fp[mode == DFmode].fromint;
9509       return false;
9510
9511     case FLOAT_EXTEND:
9512       if (speed)
9513         {
9514           if (VECTOR_MODE_P (mode))
9515             {
9516               /*Vector truncate.  */
9517               *cost += extra_cost->vect.alu;
9518             }
9519           else
9520             *cost += extra_cost->fp[mode == DFmode].widen;
9521         }
9522       return false;
9523
9524     case FLOAT_TRUNCATE:
9525       if (speed)
9526         {
9527           if (VECTOR_MODE_P (mode))
9528             {
9529               /*Vector conversion.  */
9530               *cost += extra_cost->vect.alu;
9531             }
9532           else
9533             *cost += extra_cost->fp[mode == DFmode].narrow;
9534         }
9535       return false;
9536
9537     case FIX:
9538     case UNSIGNED_FIX:
9539       x = XEXP (x, 0);
9540       /* Strip the rounding part.  They will all be implemented
9541          by the fcvt* family of instructions anyway.  */
9542       if (GET_CODE (x) == UNSPEC)
9543         {
9544           unsigned int uns_code = XINT (x, 1);
9545
9546           if (uns_code == UNSPEC_FRINTA
9547               || uns_code == UNSPEC_FRINTM
9548               || uns_code == UNSPEC_FRINTN
9549               || uns_code == UNSPEC_FRINTP
9550               || uns_code == UNSPEC_FRINTZ)
9551             x = XVECEXP (x, 0, 0);
9552         }
9553
9554       if (speed)
9555         {
9556           if (VECTOR_MODE_P (mode))
9557             *cost += extra_cost->vect.alu;
9558           else
9559             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9560         }
9561
9562       /* We can combine fmul by a power of 2 followed by a fcvt into a single
9563          fixed-point fcvt.  */
9564       if (GET_CODE (x) == MULT
9565           && ((VECTOR_MODE_P (mode)
9566                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9567               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9568         {
9569           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9570                              0, speed);
9571           return true;
9572         }
9573
9574       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9575       return true;
9576
9577     case ABS:
9578       if (VECTOR_MODE_P (mode))
9579         {
9580           /* ABS (vector).  */
9581           if (speed)
9582             *cost += extra_cost->vect.alu;
9583         }
9584       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9585         {
9586           op0 = XEXP (x, 0);
9587
9588           /* FABD, which is analogous to FADD.  */
9589           if (GET_CODE (op0) == MINUS)
9590             {
9591               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9592               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9593               if (speed)
9594                 *cost += extra_cost->fp[mode == DFmode].addsub;
9595
9596               return true;
9597             }
9598           /* Simple FABS is analogous to FNEG.  */
9599           if (speed)
9600             *cost += extra_cost->fp[mode == DFmode].neg;
9601         }
9602       else
9603         {
9604           /* Integer ABS will either be split to
9605              two arithmetic instructions, or will be an ABS
9606              (scalar), which we don't model.  */
9607           *cost = COSTS_N_INSNS (2);
9608           if (speed)
9609             *cost += 2 * extra_cost->alu.arith;
9610         }
9611       return false;
9612
9613     case SMAX:
9614     case SMIN:
9615       if (speed)
9616         {
9617           if (VECTOR_MODE_P (mode))
9618             *cost += extra_cost->vect.alu;
9619           else
9620             {
9621               /* FMAXNM/FMINNM/FMAX/FMIN.
9622                  TODO: This may not be accurate for all implementations, but
9623                  we do not model this in the cost tables.  */
9624               *cost += extra_cost->fp[mode == DFmode].addsub;
9625             }
9626         }
9627       return false;
9628
9629     case UNSPEC:
9630       /* The floating point round to integer frint* instructions.  */
9631       if (aarch64_frint_unspec_p (XINT (x, 1)))
9632         {
9633           if (speed)
9634             *cost += extra_cost->fp[mode == DFmode].roundint;
9635
9636           return false;
9637         }
9638
9639       if (XINT (x, 1) == UNSPEC_RBIT)
9640         {
9641           if (speed)
9642             *cost += extra_cost->alu.rev;
9643
9644           return false;
9645         }
9646       break;
9647
9648     case TRUNCATE:
9649
9650       /* Decompose <su>muldi3_highpart.  */
9651       if (/* (truncate:DI  */
9652           mode == DImode
9653           /*   (lshiftrt:TI  */
9654           && GET_MODE (XEXP (x, 0)) == TImode
9655           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9656           /*      (mult:TI  */
9657           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9658           /*        (ANY_EXTEND:TI (reg:DI))
9659                     (ANY_EXTEND:TI (reg:DI)))  */
9660           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9661                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9662               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9663                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9664           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9665           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9666           /*     (const_int 64)  */
9667           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9668           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9669         {
9670           /* UMULH/SMULH.  */
9671           if (speed)
9672             *cost += extra_cost->mult[mode == DImode].extend;
9673           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9674                              mode, MULT, 0, speed);
9675           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9676                              mode, MULT, 1, speed);
9677           return true;
9678         }
9679
9680       /* Fall through.  */
9681     default:
9682       break;
9683     }
9684
9685   if (dump_file
9686       && flag_aarch64_verbose_cost)
9687     fprintf (dump_file,
9688       "\nFailed to cost RTX.  Assuming default cost.\n");
9689
9690   return true;
9691 }
9692
9693 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9694    calculated for X.  This cost is stored in *COST.  Returns true
9695    if the total cost of X was calculated.  */
9696 static bool
9697 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9698                    int param, int *cost, bool speed)
9699 {
9700   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9701
9702   if (dump_file
9703       && flag_aarch64_verbose_cost)
9704     {
9705       print_rtl_single (dump_file, x);
9706       fprintf (dump_file, "\n%s cost: %d (%s)\n",
9707                speed ? "Hot" : "Cold",
9708                *cost, result ? "final" : "partial");
9709     }
9710
9711   return result;
9712 }
9713
9714 static int
9715 aarch64_register_move_cost (machine_mode mode,
9716                             reg_class_t from_i, reg_class_t to_i)
9717 {
9718   enum reg_class from = (enum reg_class) from_i;
9719   enum reg_class to = (enum reg_class) to_i;
9720   const struct cpu_regmove_cost *regmove_cost
9721     = aarch64_tune_params.regmove_cost;
9722
9723   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
9724   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9725     to = GENERAL_REGS;
9726
9727   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9728     from = GENERAL_REGS;
9729
9730   /* Moving between GPR and stack cost is the same as GP2GP.  */
9731   if ((from == GENERAL_REGS && to == STACK_REG)
9732       || (to == GENERAL_REGS && from == STACK_REG))
9733     return regmove_cost->GP2GP;
9734
9735   /* To/From the stack register, we move via the gprs.  */
9736   if (to == STACK_REG || from == STACK_REG)
9737     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9738             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9739
9740   if (known_eq (GET_MODE_SIZE (mode), 16))
9741     {
9742       /* 128-bit operations on general registers require 2 instructions.  */
9743       if (from == GENERAL_REGS && to == GENERAL_REGS)
9744         return regmove_cost->GP2GP * 2;
9745       else if (from == GENERAL_REGS)
9746         return regmove_cost->GP2FP * 2;
9747       else if (to == GENERAL_REGS)
9748         return regmove_cost->FP2GP * 2;
9749
9750       /* When AdvSIMD instructions are disabled it is not possible to move
9751          a 128-bit value directly between Q registers.  This is handled in
9752          secondary reload.  A general register is used as a scratch to move
9753          the upper DI value and the lower DI value is moved directly,
9754          hence the cost is the sum of three moves. */
9755       if (! TARGET_SIMD)
9756         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9757
9758       return regmove_cost->FP2FP;
9759     }
9760
9761   if (from == GENERAL_REGS && to == GENERAL_REGS)
9762     return regmove_cost->GP2GP;
9763   else if (from == GENERAL_REGS)
9764     return regmove_cost->GP2FP;
9765   else if (to == GENERAL_REGS)
9766     return regmove_cost->FP2GP;
9767
9768   return regmove_cost->FP2FP;
9769 }
9770
9771 static int
9772 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9773                           reg_class_t rclass ATTRIBUTE_UNUSED,
9774                           bool in ATTRIBUTE_UNUSED)
9775 {
9776   return aarch64_tune_params.memmov_cost;
9777 }
9778
9779 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9780    to optimize 1.0/sqrt.  */
9781
9782 static bool
9783 use_rsqrt_p (machine_mode mode)
9784 {
9785   return (!flag_trapping_math
9786           && flag_unsafe_math_optimizations
9787           && ((aarch64_tune_params.approx_modes->recip_sqrt
9788                & AARCH64_APPROX_MODE (mode))
9789               || flag_mrecip_low_precision_sqrt));
9790 }
9791
9792 /* Function to decide when to use the approximate reciprocal square root
9793    builtin.  */
9794
9795 static tree
9796 aarch64_builtin_reciprocal (tree fndecl)
9797 {
9798   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9799
9800   if (!use_rsqrt_p (mode))
9801     return NULL_TREE;
9802   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9803 }
9804
9805 typedef rtx (*rsqrte_type) (rtx, rtx);
9806
9807 /* Select reciprocal square root initial estimate insn depending on machine
9808    mode.  */
9809
9810 static rsqrte_type
9811 get_rsqrte_type (machine_mode mode)
9812 {
9813   switch (mode)
9814   {
9815     case E_DFmode:   return gen_aarch64_rsqrtedf;
9816     case E_SFmode:   return gen_aarch64_rsqrtesf;
9817     case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9818     case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9819     case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9820     default: gcc_unreachable ();
9821   }
9822 }
9823
9824 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9825
9826 /* Select reciprocal square root series step insn depending on machine mode.  */
9827
9828 static rsqrts_type
9829 get_rsqrts_type (machine_mode mode)
9830 {
9831   switch (mode)
9832   {
9833     case E_DFmode:   return gen_aarch64_rsqrtsdf;
9834     case E_SFmode:   return gen_aarch64_rsqrtssf;
9835     case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9836     case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9837     case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9838     default: gcc_unreachable ();
9839   }
9840 }
9841
9842 /* Emit instruction sequence to compute either the approximate square root
9843    or its approximate reciprocal, depending on the flag RECP, and return
9844    whether the sequence was emitted or not.  */
9845
9846 bool
9847 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9848 {
9849   machine_mode mode = GET_MODE (dst);
9850
9851   if (GET_MODE_INNER (mode) == HFmode)
9852     {
9853       gcc_assert (!recp);
9854       return false;
9855     }
9856
9857   if (!recp)
9858     {
9859       if (!(flag_mlow_precision_sqrt
9860             || (aarch64_tune_params.approx_modes->sqrt
9861                 & AARCH64_APPROX_MODE (mode))))
9862         return false;
9863
9864       if (flag_finite_math_only
9865           || flag_trapping_math
9866           || !flag_unsafe_math_optimizations
9867           || optimize_function_for_size_p (cfun))
9868         return false;
9869     }
9870   else
9871     /* Caller assumes we cannot fail.  */
9872     gcc_assert (use_rsqrt_p (mode));
9873
9874   machine_mode mmsk = mode_for_int_vector (mode).require ();
9875   rtx xmsk = gen_reg_rtx (mmsk);
9876   if (!recp)
9877     /* When calculating the approximate square root, compare the
9878        argument with 0.0 and create a mask.  */
9879     emit_insn (gen_rtx_SET (xmsk,
9880                             gen_rtx_NEG (mmsk,
9881                                          gen_rtx_EQ (mmsk, src,
9882                                                      CONST0_RTX (mode)))));
9883
9884   /* Estimate the approximate reciprocal square root.  */
9885   rtx xdst = gen_reg_rtx (mode);
9886   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9887
9888   /* Iterate over the series twice for SF and thrice for DF.  */
9889   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9890
9891   /* Optionally iterate over the series once less for faster performance
9892      while sacrificing the accuracy.  */
9893   if ((recp && flag_mrecip_low_precision_sqrt)
9894       || (!recp && flag_mlow_precision_sqrt))
9895     iterations--;
9896
9897   /* Iterate over the series to calculate the approximate reciprocal square
9898      root.  */
9899   rtx x1 = gen_reg_rtx (mode);
9900   while (iterations--)
9901     {
9902       rtx x2 = gen_reg_rtx (mode);
9903       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9904
9905       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9906
9907       if (iterations > 0)
9908         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9909     }
9910
9911   if (!recp)
9912     {
9913       /* Qualify the approximate reciprocal square root when the argument is
9914          0.0 by squashing the intermediary result to 0.0.  */
9915       rtx xtmp = gen_reg_rtx (mmsk);
9916       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9917                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
9918       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9919
9920       /* Calculate the approximate square root.  */
9921       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9922     }
9923
9924   /* Finalize the approximation.  */
9925   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9926
9927   return true;
9928 }
9929
9930 typedef rtx (*recpe_type) (rtx, rtx);
9931
9932 /* Select reciprocal initial estimate insn depending on machine mode.  */
9933
9934 static recpe_type
9935 get_recpe_type (machine_mode mode)
9936 {
9937   switch (mode)
9938   {
9939     case E_SFmode:   return (gen_aarch64_frecpesf);
9940     case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9941     case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9942     case E_DFmode:   return (gen_aarch64_frecpedf);
9943     case E_V2DFmode: return (gen_aarch64_frecpev2df);
9944     default:         gcc_unreachable ();
9945   }
9946 }
9947
9948 typedef rtx (*recps_type) (rtx, rtx, rtx);
9949
9950 /* Select reciprocal series step insn depending on machine mode.  */
9951
9952 static recps_type
9953 get_recps_type (machine_mode mode)
9954 {
9955   switch (mode)
9956   {
9957     case E_SFmode:   return (gen_aarch64_frecpssf);
9958     case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9959     case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9960     case E_DFmode:   return (gen_aarch64_frecpsdf);
9961     case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9962     default:         gcc_unreachable ();
9963   }
9964 }
9965
9966 /* Emit the instruction sequence to compute the approximation for the division
9967    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
9968
9969 bool
9970 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9971 {
9972   machine_mode mode = GET_MODE (quo);
9973
9974   if (GET_MODE_INNER (mode) == HFmode)
9975     return false;
9976
9977   bool use_approx_division_p = (flag_mlow_precision_div
9978                                 || (aarch64_tune_params.approx_modes->division
9979                                     & AARCH64_APPROX_MODE (mode)));
9980
9981   if (!flag_finite_math_only
9982       || flag_trapping_math
9983       || !flag_unsafe_math_optimizations
9984       || optimize_function_for_size_p (cfun)
9985       || !use_approx_division_p)
9986     return false;
9987
9988   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9989     return false;
9990
9991   /* Estimate the approximate reciprocal.  */
9992   rtx xrcp = gen_reg_rtx (mode);
9993   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
9994
9995   /* Iterate over the series twice for SF and thrice for DF.  */
9996   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9997
9998   /* Optionally iterate over the series once less for faster performance,
9999      while sacrificing the accuracy.  */
10000   if (flag_mlow_precision_div)
10001     iterations--;
10002
10003   /* Iterate over the series to calculate the approximate reciprocal.  */
10004   rtx xtmp = gen_reg_rtx (mode);
10005   while (iterations--)
10006     {
10007       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10008
10009       if (iterations > 0)
10010         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10011     }
10012
10013   if (num != CONST1_RTX (mode))
10014     {
10015       /* As the approximate reciprocal of DEN is already calculated, only
10016          calculate the approximate division when NUM is not 1.0.  */
10017       rtx xnum = force_reg (mode, num);
10018       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10019     }
10020
10021   /* Finalize the approximation.  */
10022   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10023   return true;
10024 }
10025
10026 /* Return the number of instructions that can be issued per cycle.  */
10027 static int
10028 aarch64_sched_issue_rate (void)
10029 {
10030   return aarch64_tune_params.issue_rate;
10031 }
10032
10033 static int
10034 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10035 {
10036   int issue_rate = aarch64_sched_issue_rate ();
10037
10038   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10039 }
10040
10041
10042 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10043    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
10044    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
10045
10046 static int
10047 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10048                                                     int ready_index)
10049 {
10050   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10051 }
10052
10053
10054 /* Vectorizer cost model target hooks.  */
10055
10056 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
10057 static int
10058 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10059                                     tree vectype,
10060                                     int misalign ATTRIBUTE_UNUSED)
10061 {
10062   unsigned elements;
10063   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10064   bool fp = false;
10065
10066   if (vectype != NULL)
10067     fp = FLOAT_TYPE_P (vectype);
10068
10069   switch (type_of_cost)
10070     {
10071       case scalar_stmt:
10072         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10073
10074       case scalar_load:
10075         return costs->scalar_load_cost;
10076
10077       case scalar_store:
10078         return costs->scalar_store_cost;
10079
10080       case vector_stmt:
10081         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10082
10083       case vector_load:
10084         return costs->vec_align_load_cost;
10085
10086       case vector_store:
10087         return costs->vec_store_cost;
10088
10089       case vec_to_scalar:
10090         return costs->vec_to_scalar_cost;
10091
10092       case scalar_to_vec:
10093         return costs->scalar_to_vec_cost;
10094
10095       case unaligned_load:
10096       case vector_gather_load:
10097         return costs->vec_unalign_load_cost;
10098
10099       case unaligned_store:
10100       case vector_scatter_store:
10101         return costs->vec_unalign_store_cost;
10102
10103       case cond_branch_taken:
10104         return costs->cond_taken_branch_cost;
10105
10106       case cond_branch_not_taken:
10107         return costs->cond_not_taken_branch_cost;
10108
10109       case vec_perm:
10110         return costs->vec_permute_cost;
10111
10112       case vec_promote_demote:
10113         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10114
10115       case vec_construct:
10116         elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10117         return elements / 2 + 1;
10118
10119       default:
10120         gcc_unreachable ();
10121     }
10122 }
10123
10124 /* Implement targetm.vectorize.add_stmt_cost.  */
10125 static unsigned
10126 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10127                        struct _stmt_vec_info *stmt_info, int misalign,
10128                        enum vect_cost_model_location where)
10129 {
10130   unsigned *cost = (unsigned *) data;
10131   unsigned retval = 0;
10132
10133   if (flag_vect_cost_model)
10134     {
10135       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10136       int stmt_cost =
10137             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10138
10139       /* Statements in an inner loop relative to the loop being
10140          vectorized are weighted more heavily.  The value here is
10141          arbitrary and could potentially be improved with analysis.  */
10142       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10143         count *= 50; /*  FIXME  */
10144
10145       retval = (unsigned) (count * stmt_cost);
10146       cost[where] += retval;
10147     }
10148
10149   return retval;
10150 }
10151
10152 static void initialize_aarch64_code_model (struct gcc_options *);
10153
10154 /* Parse the TO_PARSE string and put the architecture struct that it
10155    selects into RES and the architectural features into ISA_FLAGS.
10156    Return an aarch64_parse_opt_result describing the parse result.
10157    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
10158
10159 static enum aarch64_parse_opt_result
10160 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10161                     unsigned long *isa_flags)
10162 {
10163   char *ext;
10164   const struct processor *arch;
10165   char *str = (char *) alloca (strlen (to_parse) + 1);
10166   size_t len;
10167
10168   strcpy (str, to_parse);
10169
10170   ext = strchr (str, '+');
10171
10172   if (ext != NULL)
10173     len = ext - str;
10174   else
10175     len = strlen (str);
10176
10177   if (len == 0)
10178     return AARCH64_PARSE_MISSING_ARG;
10179
10180
10181   /* Loop through the list of supported ARCHes to find a match.  */
10182   for (arch = all_architectures; arch->name != NULL; arch++)
10183     {
10184       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10185         {
10186           unsigned long isa_temp = arch->flags;
10187
10188           if (ext != NULL)
10189             {
10190               /* TO_PARSE string contains at least one extension.  */
10191               enum aarch64_parse_opt_result ext_res
10192                 = aarch64_parse_extension (ext, &isa_temp);
10193
10194               if (ext_res != AARCH64_PARSE_OK)
10195                 return ext_res;
10196             }
10197           /* Extension parsing was successful.  Confirm the result
10198              arch and ISA flags.  */
10199           *res = arch;
10200           *isa_flags = isa_temp;
10201           return AARCH64_PARSE_OK;
10202         }
10203     }
10204
10205   /* ARCH name not found in list.  */
10206   return AARCH64_PARSE_INVALID_ARG;
10207 }
10208
10209 /* Parse the TO_PARSE string and put the result tuning in RES and the
10210    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
10211    describing the parse result.  If there is an error parsing, RES and
10212    ISA_FLAGS are left unchanged.  */
10213
10214 static enum aarch64_parse_opt_result
10215 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10216                    unsigned long *isa_flags)
10217 {
10218   char *ext;
10219   const struct processor *cpu;
10220   char *str = (char *) alloca (strlen (to_parse) + 1);
10221   size_t len;
10222
10223   strcpy (str, to_parse);
10224
10225   ext = strchr (str, '+');
10226
10227   if (ext != NULL)
10228     len = ext - str;
10229   else
10230     len = strlen (str);
10231
10232   if (len == 0)
10233     return AARCH64_PARSE_MISSING_ARG;
10234
10235
10236   /* Loop through the list of supported CPUs to find a match.  */
10237   for (cpu = all_cores; cpu->name != NULL; cpu++)
10238     {
10239       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10240         {
10241           unsigned long isa_temp = cpu->flags;
10242
10243
10244           if (ext != NULL)
10245             {
10246               /* TO_PARSE string contains at least one extension.  */
10247               enum aarch64_parse_opt_result ext_res
10248                 = aarch64_parse_extension (ext, &isa_temp);
10249
10250               if (ext_res != AARCH64_PARSE_OK)
10251                 return ext_res;
10252             }
10253           /* Extension parsing was successfull.  Confirm the result
10254              cpu and ISA flags.  */
10255           *res = cpu;
10256           *isa_flags = isa_temp;
10257           return AARCH64_PARSE_OK;
10258         }
10259     }
10260
10261   /* CPU name not found in list.  */
10262   return AARCH64_PARSE_INVALID_ARG;
10263 }
10264
10265 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10266    Return an aarch64_parse_opt_result describing the parse result.
10267    If the parsing fails the RES does not change.  */
10268
10269 static enum aarch64_parse_opt_result
10270 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10271 {
10272   const struct processor *cpu;
10273   char *str = (char *) alloca (strlen (to_parse) + 1);
10274
10275   strcpy (str, to_parse);
10276
10277   /* Loop through the list of supported CPUs to find a match.  */
10278   for (cpu = all_cores; cpu->name != NULL; cpu++)
10279     {
10280       if (strcmp (cpu->name, str) == 0)
10281         {
10282           *res = cpu;
10283           return AARCH64_PARSE_OK;
10284         }
10285     }
10286
10287   /* CPU name not found in list.  */
10288   return AARCH64_PARSE_INVALID_ARG;
10289 }
10290
10291 /* Parse TOKEN, which has length LENGTH to see if it is an option
10292    described in FLAG.  If it is, return the index bit for that fusion type.
10293    If not, error (printing OPTION_NAME) and return zero.  */
10294
10295 static unsigned int
10296 aarch64_parse_one_option_token (const char *token,
10297                                 size_t length,
10298                                 const struct aarch64_flag_desc *flag,
10299                                 const char *option_name)
10300 {
10301   for (; flag->name != NULL; flag++)
10302     {
10303       if (length == strlen (flag->name)
10304           && !strncmp (flag->name, token, length))
10305         return flag->flag;
10306     }
10307
10308   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10309   return 0;
10310 }
10311
10312 /* Parse OPTION which is a comma-separated list of flags to enable.
10313    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10314    default state we inherit from the CPU tuning structures.  OPTION_NAME
10315    gives the top-level option we are parsing in the -moverride string,
10316    for use in error messages.  */
10317
10318 static unsigned int
10319 aarch64_parse_boolean_options (const char *option,
10320                                const struct aarch64_flag_desc *flags,
10321                                unsigned int initial_state,
10322                                const char *option_name)
10323 {
10324   const char separator = '.';
10325   const char* specs = option;
10326   const char* ntoken = option;
10327   unsigned int found_flags = initial_state;
10328
10329   while ((ntoken = strchr (specs, separator)))
10330     {
10331       size_t token_length = ntoken - specs;
10332       unsigned token_ops = aarch64_parse_one_option_token (specs,
10333                                                            token_length,
10334                                                            flags,
10335                                                            option_name);
10336       /* If we find "none" (or, for simplicity's sake, an error) anywhere
10337          in the token stream, reset the supported operations.  So:
10338
10339            adrp+add.cmp+branch.none.adrp+add
10340
10341            would have the result of turning on only adrp+add fusion.  */
10342       if (!token_ops)
10343         found_flags = 0;
10344
10345       found_flags |= token_ops;
10346       specs = ++ntoken;
10347     }
10348
10349   /* We ended with a comma, print something.  */
10350   if (!(*specs))
10351     {
10352       error ("%s string ill-formed\n", option_name);
10353       return 0;
10354     }
10355
10356   /* We still have one more token to parse.  */
10357   size_t token_length = strlen (specs);
10358   unsigned token_ops = aarch64_parse_one_option_token (specs,
10359                                                        token_length,
10360                                                        flags,
10361                                                        option_name);
10362    if (!token_ops)
10363      found_flags = 0;
10364
10365   found_flags |= token_ops;
10366   return found_flags;
10367 }
10368
10369 /* Support for overriding instruction fusion.  */
10370
10371 static void
10372 aarch64_parse_fuse_string (const char *fuse_string,
10373                             struct tune_params *tune)
10374 {
10375   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10376                                                      aarch64_fusible_pairs,
10377                                                      tune->fusible_ops,
10378                                                      "fuse=");
10379 }
10380
10381 /* Support for overriding other tuning flags.  */
10382
10383 static void
10384 aarch64_parse_tune_string (const char *tune_string,
10385                             struct tune_params *tune)
10386 {
10387   tune->extra_tuning_flags
10388     = aarch64_parse_boolean_options (tune_string,
10389                                      aarch64_tuning_flags,
10390                                      tune->extra_tuning_flags,
10391                                      "tune=");
10392 }
10393
10394 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10395    we understand.  If it is, extract the option string and handoff to
10396    the appropriate function.  */
10397
10398 void
10399 aarch64_parse_one_override_token (const char* token,
10400                                   size_t length,
10401                                   struct tune_params *tune)
10402 {
10403   const struct aarch64_tuning_override_function *fn
10404     = aarch64_tuning_override_functions;
10405
10406   const char *option_part = strchr (token, '=');
10407   if (!option_part)
10408     {
10409       error ("tuning string missing in option (%s)", token);
10410       return;
10411     }
10412
10413   /* Get the length of the option name.  */
10414   length = option_part - token;
10415   /* Skip the '=' to get to the option string.  */
10416   option_part++;
10417
10418   for (; fn->name != NULL; fn++)
10419     {
10420       if (!strncmp (fn->name, token, length))
10421         {
10422           fn->parse_override (option_part, tune);
10423           return;
10424         }
10425     }
10426
10427   error ("unknown tuning option (%s)",token);
10428   return;
10429 }
10430
10431 /* A checking mechanism for the implementation of the tls size.  */
10432
10433 static void
10434 initialize_aarch64_tls_size (struct gcc_options *opts)
10435 {
10436   if (aarch64_tls_size == 0)
10437     aarch64_tls_size = 24;
10438
10439   switch (opts->x_aarch64_cmodel_var)
10440     {
10441     case AARCH64_CMODEL_TINY:
10442       /* Both the default and maximum TLS size allowed under tiny is 1M which
10443          needs two instructions to address, so we clamp the size to 24.  */
10444       if (aarch64_tls_size > 24)
10445         aarch64_tls_size = 24;
10446       break;
10447     case AARCH64_CMODEL_SMALL:
10448       /* The maximum TLS size allowed under small is 4G.  */
10449       if (aarch64_tls_size > 32)
10450         aarch64_tls_size = 32;
10451       break;
10452     case AARCH64_CMODEL_LARGE:
10453       /* The maximum TLS size allowed under large is 16E.
10454          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
10455       if (aarch64_tls_size > 48)
10456         aarch64_tls_size = 48;
10457       break;
10458     default:
10459       gcc_unreachable ();
10460     }
10461
10462   return;
10463 }
10464
10465 /* Parse STRING looking for options in the format:
10466      string     :: option:string
10467      option     :: name=substring
10468      name       :: {a-z}
10469      substring  :: defined by option.  */
10470
10471 static void
10472 aarch64_parse_override_string (const char* input_string,
10473                                struct tune_params* tune)
10474 {
10475   const char separator = ':';
10476   size_t string_length = strlen (input_string) + 1;
10477   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10478   char *string = string_root;
10479   strncpy (string, input_string, string_length);
10480   string[string_length - 1] = '\0';
10481
10482   char* ntoken = string;
10483
10484   while ((ntoken = strchr (string, separator)))
10485     {
10486       size_t token_length = ntoken - string;
10487       /* Make this substring look like a string.  */
10488       *ntoken = '\0';
10489       aarch64_parse_one_override_token (string, token_length, tune);
10490       string = ++ntoken;
10491     }
10492
10493   /* One last option to parse.  */
10494   aarch64_parse_one_override_token (string, strlen (string), tune);
10495   free (string_root);
10496 }
10497
10498
10499 static void
10500 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10501 {
10502   /* PR 70044: We have to be careful about being called multiple times for the
10503      same function.  This means all changes should be repeatable.  */
10504
10505   /* If the frame pointer is enabled, set it to a special value that behaves
10506      similar to frame pointer omission.  If we don't do this all leaf functions
10507      will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10508      If flag_omit_frame_pointer has this special value, we must force the
10509      frame pointer if not in a leaf function.  We also need to force it in a
10510      leaf function if flag_omit_frame_pointer is not set or if LR is used.  */
10511   if (opts->x_flag_omit_frame_pointer == 0)
10512     opts->x_flag_omit_frame_pointer = 2;
10513
10514   /* If not optimizing for size, set the default
10515      alignment to what the target wants.  */
10516   if (!opts->x_optimize_size)
10517     {
10518       if (opts->x_align_loops <= 0)
10519         opts->x_align_loops = aarch64_tune_params.loop_align;
10520       if (opts->x_align_jumps <= 0)
10521         opts->x_align_jumps = aarch64_tune_params.jump_align;
10522       if (opts->x_align_functions <= 0)
10523         opts->x_align_functions = aarch64_tune_params.function_align;
10524     }
10525
10526   /* We default to no pc-relative literal loads.  */
10527
10528   aarch64_pcrelative_literal_loads = false;
10529
10530   /* If -mpc-relative-literal-loads is set on the command line, this
10531      implies that the user asked for PC relative literal loads.  */
10532   if (opts->x_pcrelative_literal_loads == 1)
10533     aarch64_pcrelative_literal_loads = true;
10534
10535   /* In the tiny memory model it makes no sense to disallow PC relative
10536      literal pool loads.  */
10537   if (aarch64_cmodel == AARCH64_CMODEL_TINY
10538       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10539     aarch64_pcrelative_literal_loads = true;
10540
10541   /* When enabling the lower precision Newton series for the square root, also
10542      enable it for the reciprocal square root, since the latter is an
10543      intermediary step for the former.  */
10544   if (flag_mlow_precision_sqrt)
10545     flag_mrecip_low_precision_sqrt = true;
10546 }
10547
10548 /* 'Unpack' up the internal tuning structs and update the options
10549     in OPTS.  The caller must have set up selected_tune and selected_arch
10550     as all the other target-specific codegen decisions are
10551     derived from them.  */
10552
10553 void
10554 aarch64_override_options_internal (struct gcc_options *opts)
10555 {
10556   aarch64_tune_flags = selected_tune->flags;
10557   aarch64_tune = selected_tune->sched_core;
10558   /* Make a copy of the tuning parameters attached to the core, which
10559      we may later overwrite.  */
10560   aarch64_tune_params = *(selected_tune->tune);
10561   aarch64_architecture_version = selected_arch->architecture_version;
10562
10563   if (opts->x_aarch64_override_tune_string)
10564     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10565                                   &aarch64_tune_params);
10566
10567   /* This target defaults to strict volatile bitfields.  */
10568   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10569     opts->x_flag_strict_volatile_bitfields = 1;
10570
10571   initialize_aarch64_code_model (opts);
10572   initialize_aarch64_tls_size (opts);
10573
10574   int queue_depth = 0;
10575   switch (aarch64_tune_params.autoprefetcher_model)
10576     {
10577       case tune_params::AUTOPREFETCHER_OFF:
10578         queue_depth = -1;
10579         break;
10580       case tune_params::AUTOPREFETCHER_WEAK:
10581         queue_depth = 0;
10582         break;
10583       case tune_params::AUTOPREFETCHER_STRONG:
10584         queue_depth = max_insn_queue_index + 1;
10585         break;
10586       default:
10587         gcc_unreachable ();
10588     }
10589
10590   /* We don't mind passing in global_options_set here as we don't use
10591      the *options_set structs anyway.  */
10592   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10593                          queue_depth,
10594                          opts->x_param_values,
10595                          global_options_set.x_param_values);
10596
10597   /* Set up parameters to be used in prefetching algorithm.  Do not
10598      override the defaults unless we are tuning for a core we have
10599      researched values for.  */
10600   if (aarch64_tune_params.prefetch->num_slots > 0)
10601     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10602                            aarch64_tune_params.prefetch->num_slots,
10603                            opts->x_param_values,
10604                            global_options_set.x_param_values);
10605   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10606     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10607                            aarch64_tune_params.prefetch->l1_cache_size,
10608                            opts->x_param_values,
10609                            global_options_set.x_param_values);
10610   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10611     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10612                            aarch64_tune_params.prefetch->l1_cache_line_size,
10613                            opts->x_param_values,
10614                            global_options_set.x_param_values);
10615   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10616     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10617                            aarch64_tune_params.prefetch->l2_cache_size,
10618                            opts->x_param_values,
10619                            global_options_set.x_param_values);
10620
10621   /* Use the alternative scheduling-pressure algorithm by default.  */
10622   maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10623                          opts->x_param_values,
10624                          global_options_set.x_param_values);
10625
10626   /* Enable sw prefetching at specified optimization level for
10627      CPUS that have prefetch.  Lower optimization level threshold by 1
10628      when profiling is enabled.  */
10629   if (opts->x_flag_prefetch_loop_arrays < 0
10630       && !opts->x_optimize_size
10631       && aarch64_tune_params.prefetch->default_opt_level >= 0
10632       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10633     opts->x_flag_prefetch_loop_arrays = 1;
10634
10635   aarch64_override_options_after_change_1 (opts);
10636 }
10637
10638 /* Print a hint with a suggestion for a core or architecture name that
10639    most closely resembles what the user passed in STR.  ARCH is true if
10640    the user is asking for an architecture name.  ARCH is false if the user
10641    is asking for a core name.  */
10642
10643 static void
10644 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10645 {
10646   auto_vec<const char *> candidates;
10647   const struct processor *entry = arch ? all_architectures : all_cores;
10648   for (; entry->name != NULL; entry++)
10649     candidates.safe_push (entry->name);
10650
10651 #ifdef HAVE_LOCAL_CPU_DETECT
10652   /* Add also "native" as possible value.  */
10653   if (arch)
10654     candidates.safe_push ("native");
10655 #endif
10656
10657   char *s;
10658   const char *hint = candidates_list_and_hint (str, s, candidates);
10659   if (hint)
10660     inform (input_location, "valid arguments are: %s;"
10661                              " did you mean %qs?", s, hint);
10662   else
10663     inform (input_location, "valid arguments are: %s", s);
10664
10665   XDELETEVEC (s);
10666 }
10667
10668 /* Print a hint with a suggestion for a core name that most closely resembles
10669    what the user passed in STR.  */
10670
10671 inline static void
10672 aarch64_print_hint_for_core (const char *str)
10673 {
10674   aarch64_print_hint_for_core_or_arch (str, false);
10675 }
10676
10677 /* Print a hint with a suggestion for an architecture name that most closely
10678    resembles what the user passed in STR.  */
10679
10680 inline static void
10681 aarch64_print_hint_for_arch (const char *str)
10682 {
10683   aarch64_print_hint_for_core_or_arch (str, true);
10684 }
10685
10686 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
10687    specified in STR and throw errors if appropriate.  Put the results if
10688    they are valid in RES and ISA_FLAGS.  Return whether the option is
10689    valid.  */
10690
10691 static bool
10692 aarch64_validate_mcpu (const char *str, const struct processor **res,
10693                        unsigned long *isa_flags)
10694 {
10695   enum aarch64_parse_opt_result parse_res
10696     = aarch64_parse_cpu (str, res, isa_flags);
10697
10698   if (parse_res == AARCH64_PARSE_OK)
10699     return true;
10700
10701   switch (parse_res)
10702     {
10703       case AARCH64_PARSE_MISSING_ARG:
10704         error ("missing cpu name in %<-mcpu=%s%>", str);
10705         break;
10706       case AARCH64_PARSE_INVALID_ARG:
10707         error ("unknown value %qs for -mcpu", str);
10708         aarch64_print_hint_for_core (str);
10709         break;
10710       case AARCH64_PARSE_INVALID_FEATURE:
10711         error ("invalid feature modifier in %<-mcpu=%s%>", str);
10712         break;
10713       default:
10714         gcc_unreachable ();
10715     }
10716
10717   return false;
10718 }
10719
10720 /* Validate a command-line -march option.  Parse the arch and extensions
10721    (if any) specified in STR and throw errors if appropriate.  Put the
10722    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
10723    option is valid.  */
10724
10725 static bool
10726 aarch64_validate_march (const char *str, const struct processor **res,
10727                          unsigned long *isa_flags)
10728 {
10729   enum aarch64_parse_opt_result parse_res
10730     = aarch64_parse_arch (str, res, isa_flags);
10731
10732   if (parse_res == AARCH64_PARSE_OK)
10733     return true;
10734
10735   switch (parse_res)
10736     {
10737       case AARCH64_PARSE_MISSING_ARG:
10738         error ("missing arch name in %<-march=%s%>", str);
10739         break;
10740       case AARCH64_PARSE_INVALID_ARG:
10741         error ("unknown value %qs for -march", str);
10742         aarch64_print_hint_for_arch (str);
10743         break;
10744       case AARCH64_PARSE_INVALID_FEATURE:
10745         error ("invalid feature modifier in %<-march=%s%>", str);
10746         break;
10747       default:
10748         gcc_unreachable ();
10749     }
10750
10751   return false;
10752 }
10753
10754 /* Validate a command-line -mtune option.  Parse the cpu
10755    specified in STR and throw errors if appropriate.  Put the
10756    result, if it is valid, in RES.  Return whether the option is
10757    valid.  */
10758
10759 static bool
10760 aarch64_validate_mtune (const char *str, const struct processor **res)
10761 {
10762   enum aarch64_parse_opt_result parse_res
10763     = aarch64_parse_tune (str, res);
10764
10765   if (parse_res == AARCH64_PARSE_OK)
10766     return true;
10767
10768   switch (parse_res)
10769     {
10770       case AARCH64_PARSE_MISSING_ARG:
10771         error ("missing cpu name in %<-mtune=%s%>", str);
10772         break;
10773       case AARCH64_PARSE_INVALID_ARG:
10774         error ("unknown value %qs for -mtune", str);
10775         aarch64_print_hint_for_core (str);
10776         break;
10777       default:
10778         gcc_unreachable ();
10779     }
10780   return false;
10781 }
10782
10783 /* Return the CPU corresponding to the enum CPU.
10784    If it doesn't specify a cpu, return the default.  */
10785
10786 static const struct processor *
10787 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10788 {
10789   if (cpu != aarch64_none)
10790     return &all_cores[cpu];
10791
10792   /* The & 0x3f is to extract the bottom 6 bits that encode the
10793      default cpu as selected by the --with-cpu GCC configure option
10794      in config.gcc.
10795      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10796      flags mechanism should be reworked to make it more sane.  */
10797   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10798 }
10799
10800 /* Return the architecture corresponding to the enum ARCH.
10801    If it doesn't specify a valid architecture, return the default.  */
10802
10803 static const struct processor *
10804 aarch64_get_arch (enum aarch64_arch arch)
10805 {
10806   if (arch != aarch64_no_arch)
10807     return &all_architectures[arch];
10808
10809   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10810
10811   return &all_architectures[cpu->arch];
10812 }
10813
10814 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
10815
10816 static poly_uint16
10817 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10818 {
10819   /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10820      This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10821      deciding which .md file patterns to use and when deciding whether
10822      something is a legitimate address or constant.  */
10823   if (value == SVE_SCALABLE || value == SVE_128)
10824     return poly_uint16 (2, 2);
10825   else
10826     return (int) value / 64;
10827 }
10828
10829 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
10830    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10831    tuning structs.  In particular it must set selected_tune and
10832    aarch64_isa_flags that define the available ISA features and tuning
10833    decisions.  It must also set selected_arch as this will be used to
10834    output the .arch asm tags for each function.  */
10835
10836 static void
10837 aarch64_override_options (void)
10838 {
10839   unsigned long cpu_isa = 0;
10840   unsigned long arch_isa = 0;
10841   aarch64_isa_flags = 0;
10842
10843   bool valid_cpu = true;
10844   bool valid_tune = true;
10845   bool valid_arch = true;
10846
10847   selected_cpu = NULL;
10848   selected_arch = NULL;
10849   selected_tune = NULL;
10850
10851   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10852      If either of -march or -mtune is given, they override their
10853      respective component of -mcpu.  */
10854   if (aarch64_cpu_string)
10855     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10856                                         &cpu_isa);
10857
10858   if (aarch64_arch_string)
10859     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10860                                           &arch_isa);
10861
10862   if (aarch64_tune_string)
10863     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10864
10865   /* If the user did not specify a processor, choose the default
10866      one for them.  This will be the CPU set during configuration using
10867      --with-cpu, otherwise it is "generic".  */
10868   if (!selected_cpu)
10869     {
10870       if (selected_arch)
10871         {
10872           selected_cpu = &all_cores[selected_arch->ident];
10873           aarch64_isa_flags = arch_isa;
10874           explicit_arch = selected_arch->arch;
10875         }
10876       else
10877         {
10878           /* Get default configure-time CPU.  */
10879           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10880           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10881         }
10882
10883       if (selected_tune)
10884         explicit_tune_core = selected_tune->ident;
10885     }
10886   /* If both -mcpu and -march are specified check that they are architecturally
10887      compatible, warn if they're not and prefer the -march ISA flags.  */
10888   else if (selected_arch)
10889     {
10890       if (selected_arch->arch != selected_cpu->arch)
10891         {
10892           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10893                        all_architectures[selected_cpu->arch].name,
10894                        selected_arch->name);
10895         }
10896       aarch64_isa_flags = arch_isa;
10897       explicit_arch = selected_arch->arch;
10898       explicit_tune_core = selected_tune ? selected_tune->ident
10899                                           : selected_cpu->ident;
10900     }
10901   else
10902     {
10903       /* -mcpu but no -march.  */
10904       aarch64_isa_flags = cpu_isa;
10905       explicit_tune_core = selected_tune ? selected_tune->ident
10906                                           : selected_cpu->ident;
10907       gcc_assert (selected_cpu);
10908       selected_arch = &all_architectures[selected_cpu->arch];
10909       explicit_arch = selected_arch->arch;
10910     }
10911
10912   /* Set the arch as well as we will need it when outputing
10913      the .arch directive in assembly.  */
10914   if (!selected_arch)
10915     {
10916       gcc_assert (selected_cpu);
10917       selected_arch = &all_architectures[selected_cpu->arch];
10918     }
10919
10920   if (!selected_tune)
10921     selected_tune = selected_cpu;
10922
10923 #ifndef HAVE_AS_MABI_OPTION
10924   /* The compiler may have been configured with 2.23.* binutils, which does
10925      not have support for ILP32.  */
10926   if (TARGET_ILP32)
10927     error ("assembler does not support -mabi=ilp32");
10928 #endif
10929
10930   /* Convert -msve-vector-bits to a VG count.  */
10931   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10932
10933   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10934     sorry ("return address signing is only supported for -mabi=lp64");
10935
10936   /* Make sure we properly set up the explicit options.  */
10937   if ((aarch64_cpu_string && valid_cpu)
10938        || (aarch64_tune_string && valid_tune))
10939     gcc_assert (explicit_tune_core != aarch64_none);
10940
10941   if ((aarch64_cpu_string && valid_cpu)
10942        || (aarch64_arch_string && valid_arch))
10943     gcc_assert (explicit_arch != aarch64_no_arch);
10944
10945   aarch64_override_options_internal (&global_options);
10946
10947   /* Save these options as the default ones in case we push and pop them later
10948      while processing functions with potential target attributes.  */
10949   target_option_default_node = target_option_current_node
10950       = build_target_option_node (&global_options);
10951 }
10952
10953 /* Implement targetm.override_options_after_change.  */
10954
10955 static void
10956 aarch64_override_options_after_change (void)
10957 {
10958   aarch64_override_options_after_change_1 (&global_options);
10959 }
10960
10961 static struct machine_function *
10962 aarch64_init_machine_status (void)
10963 {
10964   struct machine_function *machine;
10965   machine = ggc_cleared_alloc<machine_function> ();
10966   return machine;
10967 }
10968
10969 void
10970 aarch64_init_expanders (void)
10971 {
10972   init_machine_status = aarch64_init_machine_status;
10973 }
10974
10975 /* A checking mechanism for the implementation of the various code models.  */
10976 static void
10977 initialize_aarch64_code_model (struct gcc_options *opts)
10978 {
10979    if (opts->x_flag_pic)
10980      {
10981        switch (opts->x_aarch64_cmodel_var)
10982          {
10983          case AARCH64_CMODEL_TINY:
10984            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10985            break;
10986          case AARCH64_CMODEL_SMALL:
10987 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10988            aarch64_cmodel = (flag_pic == 2
10989                              ? AARCH64_CMODEL_SMALL_PIC
10990                              : AARCH64_CMODEL_SMALL_SPIC);
10991 #else
10992            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10993 #endif
10994            break;
10995          case AARCH64_CMODEL_LARGE:
10996            sorry ("code model %qs with -f%s", "large",
10997                   opts->x_flag_pic > 1 ? "PIC" : "pic");
10998            break;
10999          default:
11000            gcc_unreachable ();
11001          }
11002      }
11003    else
11004      aarch64_cmodel = opts->x_aarch64_cmodel_var;
11005 }
11006
11007 /* Implement TARGET_OPTION_SAVE.  */
11008
11009 static void
11010 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11011 {
11012   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11013 }
11014
11015 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
11016    using the information saved in PTR.  */
11017
11018 static void
11019 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11020 {
11021   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11022   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11023   opts->x_explicit_arch = ptr->x_explicit_arch;
11024   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11025   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11026
11027   aarch64_override_options_internal (opts);
11028 }
11029
11030 /* Implement TARGET_OPTION_PRINT.  */
11031
11032 static void
11033 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11034 {
11035   const struct processor *cpu
11036     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11037   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11038   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11039   std::string extension
11040     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11041
11042   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11043   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11044            arch->name, extension.c_str ());
11045 }
11046
11047 static GTY(()) tree aarch64_previous_fndecl;
11048
11049 void
11050 aarch64_reset_previous_fndecl (void)
11051 {
11052   aarch64_previous_fndecl = NULL;
11053 }
11054
11055 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11056    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11057    make sure optab availability predicates are recomputed when necessary.  */
11058
11059 void
11060 aarch64_save_restore_target_globals (tree new_tree)
11061 {
11062   if (TREE_TARGET_GLOBALS (new_tree))
11063     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11064   else if (new_tree == target_option_default_node)
11065     restore_target_globals (&default_target_globals);
11066   else
11067     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11068 }
11069
11070 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
11071    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11072    of the function, if such exists.  This function may be called multiple
11073    times on a single function so use aarch64_previous_fndecl to avoid
11074    setting up identical state.  */
11075
11076 static void
11077 aarch64_set_current_function (tree fndecl)
11078 {
11079   if (!fndecl || fndecl == aarch64_previous_fndecl)
11080     return;
11081
11082   tree old_tree = (aarch64_previous_fndecl
11083                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11084                    : NULL_TREE);
11085
11086   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11087
11088   /* If current function has no attributes but the previous one did,
11089      use the default node.  */
11090   if (!new_tree && old_tree)
11091     new_tree = target_option_default_node;
11092
11093   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
11094      the default have been handled by aarch64_save_restore_target_globals from
11095      aarch64_pragma_target_parse.  */
11096   if (old_tree == new_tree)
11097     return;
11098
11099   aarch64_previous_fndecl = fndecl;
11100
11101   /* First set the target options.  */
11102   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11103
11104   aarch64_save_restore_target_globals (new_tree);
11105 }
11106
11107 /* Enum describing the various ways we can handle attributes.
11108    In many cases we can reuse the generic option handling machinery.  */
11109
11110 enum aarch64_attr_opt_type
11111 {
11112   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
11113   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
11114   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
11115   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
11116 };
11117
11118 /* All the information needed to handle a target attribute.
11119    NAME is the name of the attribute.
11120    ATTR_TYPE specifies the type of behavior of the attribute as described
11121    in the definition of enum aarch64_attr_opt_type.
11122    ALLOW_NEG is true if the attribute supports a "no-" form.
11123    HANDLER is the function that takes the attribute string as an argument
11124    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11125    OPT_NUM is the enum specifying the option that the attribute modifies.
11126    This is needed for attributes that mirror the behavior of a command-line
11127    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11128    aarch64_attr_enum.  */
11129
11130 struct aarch64_attribute_info
11131 {
11132   const char *name;
11133   enum aarch64_attr_opt_type attr_type;
11134   bool allow_neg;
11135   bool (*handler) (const char *);
11136   enum opt_code opt_num;
11137 };
11138
11139 /* Handle the ARCH_STR argument to the arch= target attribute.  */
11140
11141 static bool
11142 aarch64_handle_attr_arch (const char *str)
11143 {
11144   const struct processor *tmp_arch = NULL;
11145   enum aarch64_parse_opt_result parse_res
11146     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11147
11148   if (parse_res == AARCH64_PARSE_OK)
11149     {
11150       gcc_assert (tmp_arch);
11151       selected_arch = tmp_arch;
11152       explicit_arch = selected_arch->arch;
11153       return true;
11154     }
11155
11156   switch (parse_res)
11157     {
11158       case AARCH64_PARSE_MISSING_ARG:
11159         error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11160         break;
11161       case AARCH64_PARSE_INVALID_ARG:
11162         error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11163         aarch64_print_hint_for_arch (str);
11164         break;
11165       case AARCH64_PARSE_INVALID_FEATURE:
11166         error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11167         break;
11168       default:
11169         gcc_unreachable ();
11170     }
11171
11172   return false;
11173 }
11174
11175 /* Handle the argument CPU_STR to the cpu= target attribute.  */
11176
11177 static bool
11178 aarch64_handle_attr_cpu (const char *str)
11179 {
11180   const struct processor *tmp_cpu = NULL;
11181   enum aarch64_parse_opt_result parse_res
11182     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11183
11184   if (parse_res == AARCH64_PARSE_OK)
11185     {
11186       gcc_assert (tmp_cpu);
11187       selected_tune = tmp_cpu;
11188       explicit_tune_core = selected_tune->ident;
11189
11190       selected_arch = &all_architectures[tmp_cpu->arch];
11191       explicit_arch = selected_arch->arch;
11192       return true;
11193     }
11194
11195   switch (parse_res)
11196     {
11197       case AARCH64_PARSE_MISSING_ARG:
11198         error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11199         break;
11200       case AARCH64_PARSE_INVALID_ARG:
11201         error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11202         aarch64_print_hint_for_core (str);
11203         break;
11204       case AARCH64_PARSE_INVALID_FEATURE:
11205         error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11206         break;
11207       default:
11208         gcc_unreachable ();
11209     }
11210
11211   return false;
11212 }
11213
11214 /* Handle the argument STR to the tune= target attribute.  */
11215
11216 static bool
11217 aarch64_handle_attr_tune (const char *str)
11218 {
11219   const struct processor *tmp_tune = NULL;
11220   enum aarch64_parse_opt_result parse_res
11221     = aarch64_parse_tune (str, &tmp_tune);
11222
11223   if (parse_res == AARCH64_PARSE_OK)
11224     {
11225       gcc_assert (tmp_tune);
11226       selected_tune = tmp_tune;
11227       explicit_tune_core = selected_tune->ident;
11228       return true;
11229     }
11230
11231   switch (parse_res)
11232     {
11233       case AARCH64_PARSE_INVALID_ARG:
11234         error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11235         aarch64_print_hint_for_core (str);
11236         break;
11237       default:
11238         gcc_unreachable ();
11239     }
11240
11241   return false;
11242 }
11243
11244 /* Parse an architecture extensions target attribute string specified in STR.
11245    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
11246    if successful.  Update aarch64_isa_flags to reflect the ISA features
11247    modified.  */
11248
11249 static bool
11250 aarch64_handle_attr_isa_flags (char *str)
11251 {
11252   enum aarch64_parse_opt_result parse_res;
11253   unsigned long isa_flags = aarch64_isa_flags;
11254
11255   /* We allow "+nothing" in the beginning to clear out all architectural
11256      features if the user wants to handpick specific features.  */
11257   if (strncmp ("+nothing", str, 8) == 0)
11258     {
11259       isa_flags = 0;
11260       str += 8;
11261     }
11262
11263   parse_res = aarch64_parse_extension (str, &isa_flags);
11264
11265   if (parse_res == AARCH64_PARSE_OK)
11266     {
11267       aarch64_isa_flags = isa_flags;
11268       return true;
11269     }
11270
11271   switch (parse_res)
11272     {
11273       case AARCH64_PARSE_MISSING_ARG:
11274         error ("missing value in %<target()%> pragma or attribute");
11275         break;
11276
11277       case AARCH64_PARSE_INVALID_FEATURE:
11278         error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11279         break;
11280
11281       default:
11282         gcc_unreachable ();
11283     }
11284
11285  return false;
11286 }
11287
11288 /* The target attributes that we support.  On top of these we also support just
11289    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
11290    handled explicitly in aarch64_process_one_target_attr.  */
11291
11292 static const struct aarch64_attribute_info aarch64_attributes[] =
11293 {
11294   { "general-regs-only", aarch64_attr_mask, false, NULL,
11295      OPT_mgeneral_regs_only },
11296   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11297      OPT_mfix_cortex_a53_835769 },
11298   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11299      OPT_mfix_cortex_a53_843419 },
11300   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11301   { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11302   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11303      OPT_momit_leaf_frame_pointer },
11304   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11305   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11306      OPT_march_ },
11307   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11308   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11309      OPT_mtune_ },
11310   { "sign-return-address", aarch64_attr_enum, false, NULL,
11311      OPT_msign_return_address_ },
11312   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11313 };
11314
11315 /* Parse ARG_STR which contains the definition of one target attribute.
11316    Show appropriate errors if any or return true if the attribute is valid.  */
11317
11318 static bool
11319 aarch64_process_one_target_attr (char *arg_str)
11320 {
11321   bool invert = false;
11322
11323   size_t len = strlen (arg_str);
11324
11325   if (len == 0)
11326     {
11327       error ("malformed %<target()%> pragma or attribute");
11328       return false;
11329     }
11330
11331   char *str_to_check = (char *) alloca (len + 1);
11332   strcpy (str_to_check, arg_str);
11333
11334   /* Skip leading whitespace.  */
11335   while (*str_to_check == ' ' || *str_to_check == '\t')
11336     str_to_check++;
11337
11338   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11339      It is easier to detect and handle it explicitly here rather than going
11340      through the machinery for the rest of the target attributes in this
11341      function.  */
11342   if (*str_to_check == '+')
11343     return aarch64_handle_attr_isa_flags (str_to_check);
11344
11345   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11346     {
11347       invert = true;
11348       str_to_check += 3;
11349     }
11350   char *arg = strchr (str_to_check, '=');
11351
11352   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11353      and point ARG to "foo".  */
11354   if (arg)
11355     {
11356       *arg = '\0';
11357       arg++;
11358     }
11359   const struct aarch64_attribute_info *p_attr;
11360   bool found = false;
11361   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11362     {
11363       /* If the names don't match up, or the user has given an argument
11364          to an attribute that doesn't accept one, or didn't give an argument
11365          to an attribute that expects one, fail to match.  */
11366       if (strcmp (str_to_check, p_attr->name) != 0)
11367         continue;
11368
11369       found = true;
11370       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11371                               || p_attr->attr_type == aarch64_attr_enum;
11372
11373       if (attr_need_arg_p ^ (arg != NULL))
11374         {
11375           error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11376           return false;
11377         }
11378
11379       /* If the name matches but the attribute does not allow "no-" versions
11380          then we can't match.  */
11381       if (invert && !p_attr->allow_neg)
11382         {
11383           error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11384           return false;
11385         }
11386
11387       switch (p_attr->attr_type)
11388         {
11389         /* Has a custom handler registered.
11390            For example, cpu=, arch=, tune=.  */
11391           case aarch64_attr_custom:
11392             gcc_assert (p_attr->handler);
11393             if (!p_attr->handler (arg))
11394               return false;
11395             break;
11396
11397           /* Either set or unset a boolean option.  */
11398           case aarch64_attr_bool:
11399             {
11400               struct cl_decoded_option decoded;
11401
11402               generate_option (p_attr->opt_num, NULL, !invert,
11403                                CL_TARGET, &decoded);
11404               aarch64_handle_option (&global_options, &global_options_set,
11405                                       &decoded, input_location);
11406               break;
11407             }
11408           /* Set or unset a bit in the target_flags.  aarch64_handle_option
11409              should know what mask to apply given the option number.  */
11410           case aarch64_attr_mask:
11411             {
11412               struct cl_decoded_option decoded;
11413               /* We only need to specify the option number.
11414                  aarch64_handle_option will know which mask to apply.  */
11415               decoded.opt_index = p_attr->opt_num;
11416               decoded.value = !invert;
11417               aarch64_handle_option (&global_options, &global_options_set,
11418                                       &decoded, input_location);
11419               break;
11420             }
11421           /* Use the option setting machinery to set an option to an enum.  */
11422           case aarch64_attr_enum:
11423             {
11424               gcc_assert (arg);
11425               bool valid;
11426               int value;
11427               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11428                                               &value, CL_TARGET);
11429               if (valid)
11430                 {
11431                   set_option (&global_options, NULL, p_attr->opt_num, value,
11432                               NULL, DK_UNSPECIFIED, input_location,
11433                               global_dc);
11434                 }
11435               else
11436                 {
11437                   error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11438                 }
11439               break;
11440             }
11441           default:
11442             gcc_unreachable ();
11443         }
11444     }
11445
11446   /* If we reached here we either have found an attribute and validated
11447      it or didn't match any.  If we matched an attribute but its arguments
11448      were malformed we will have returned false already.  */
11449   return found;
11450 }
11451
11452 /* Count how many times the character C appears in
11453    NULL-terminated string STR.  */
11454
11455 static unsigned int
11456 num_occurences_in_str (char c, char *str)
11457 {
11458   unsigned int res = 0;
11459   while (*str != '\0')
11460     {
11461       if (*str == c)
11462         res++;
11463
11464       str++;
11465     }
11466
11467   return res;
11468 }
11469
11470 /* Parse the tree in ARGS that contains the target attribute information
11471    and update the global target options space.  */
11472
11473 bool
11474 aarch64_process_target_attr (tree args)
11475 {
11476   if (TREE_CODE (args) == TREE_LIST)
11477     {
11478       do
11479         {
11480           tree head = TREE_VALUE (args);
11481           if (head)
11482             {
11483               if (!aarch64_process_target_attr (head))
11484                 return false;
11485             }
11486           args = TREE_CHAIN (args);
11487         } while (args);
11488
11489       return true;
11490     }
11491
11492   if (TREE_CODE (args) != STRING_CST)
11493     {
11494       error ("attribute %<target%> argument not a string");
11495       return false;
11496     }
11497
11498   size_t len = strlen (TREE_STRING_POINTER (args));
11499   char *str_to_check = (char *) alloca (len + 1);
11500   strcpy (str_to_check, TREE_STRING_POINTER (args));
11501
11502   if (len == 0)
11503     {
11504       error ("malformed %<target()%> pragma or attribute");
11505       return false;
11506     }
11507
11508   /* Used to catch empty spaces between commas i.e.
11509      attribute ((target ("attr1,,attr2"))).  */
11510   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11511
11512   /* Handle multiple target attributes separated by ','.  */
11513   char *token = strtok (str_to_check, ",");
11514
11515   unsigned int num_attrs = 0;
11516   while (token)
11517     {
11518       num_attrs++;
11519       if (!aarch64_process_one_target_attr (token))
11520         {
11521           error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11522           return false;
11523         }
11524
11525       token = strtok (NULL, ",");
11526     }
11527
11528   if (num_attrs != num_commas + 1)
11529     {
11530       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11531       return false;
11532     }
11533
11534   return true;
11535 }
11536
11537 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
11538    process attribute ((target ("..."))).  */
11539
11540 static bool
11541 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11542 {
11543   struct cl_target_option cur_target;
11544   bool ret;
11545   tree old_optimize;
11546   tree new_target, new_optimize;
11547   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11548
11549   /* If what we're processing is the current pragma string then the
11550      target option node is already stored in target_option_current_node
11551      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
11552      having to re-parse the string.  This is especially useful to keep
11553      arm_neon.h compile times down since that header contains a lot
11554      of intrinsics enclosed in pragmas.  */
11555   if (!existing_target && args == current_target_pragma)
11556     {
11557       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11558       return true;
11559     }
11560   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11561
11562   old_optimize = build_optimization_node (&global_options);
11563   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11564
11565   /* If the function changed the optimization levels as well as setting
11566      target options, start with the optimizations specified.  */
11567   if (func_optimize && func_optimize != old_optimize)
11568     cl_optimization_restore (&global_options,
11569                              TREE_OPTIMIZATION (func_optimize));
11570
11571   /* Save the current target options to restore at the end.  */
11572   cl_target_option_save (&cur_target, &global_options);
11573
11574   /* If fndecl already has some target attributes applied to it, unpack
11575      them so that we add this attribute on top of them, rather than
11576      overwriting them.  */
11577   if (existing_target)
11578     {
11579       struct cl_target_option *existing_options
11580         = TREE_TARGET_OPTION (existing_target);
11581
11582       if (existing_options)
11583         cl_target_option_restore (&global_options, existing_options);
11584     }
11585   else
11586     cl_target_option_restore (&global_options,
11587                         TREE_TARGET_OPTION (target_option_current_node));
11588
11589   ret = aarch64_process_target_attr (args);
11590
11591   /* Set up any additional state.  */
11592   if (ret)
11593     {
11594       aarch64_override_options_internal (&global_options);
11595       /* Initialize SIMD builtins if we haven't already.
11596          Set current_target_pragma to NULL for the duration so that
11597          the builtin initialization code doesn't try to tag the functions
11598          being built with the attributes specified by any current pragma, thus
11599          going into an infinite recursion.  */
11600       if (TARGET_SIMD)
11601         {
11602           tree saved_current_target_pragma = current_target_pragma;
11603           current_target_pragma = NULL;
11604           aarch64_init_simd_builtins ();
11605           current_target_pragma = saved_current_target_pragma;
11606         }
11607       new_target = build_target_option_node (&global_options);
11608     }
11609   else
11610     new_target = NULL;
11611
11612   new_optimize = build_optimization_node (&global_options);
11613
11614   if (fndecl && ret)
11615     {
11616       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11617
11618       if (old_optimize != new_optimize)
11619         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11620     }
11621
11622   cl_target_option_restore (&global_options, &cur_target);
11623
11624   if (old_optimize != new_optimize)
11625     cl_optimization_restore (&global_options,
11626                              TREE_OPTIMIZATION (old_optimize));
11627   return ret;
11628 }
11629
11630 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
11631    tri-bool options (yes, no, don't care) and the default value is
11632    DEF, determine whether to reject inlining.  */
11633
11634 static bool
11635 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11636                                      int dont_care, int def)
11637 {
11638   /* If the callee doesn't care, always allow inlining.  */
11639   if (callee == dont_care)
11640     return true;
11641
11642   /* If the caller doesn't care, always allow inlining.  */
11643   if (caller == dont_care)
11644     return true;
11645
11646   /* Otherwise, allow inlining if either the callee and caller values
11647      agree, or if the callee is using the default value.  */
11648   return (callee == caller || callee == def);
11649 }
11650
11651 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
11652    to inline CALLEE into CALLER based on target-specific info.
11653    Make sure that the caller and callee have compatible architectural
11654    features.  Then go through the other possible target attributes
11655    and see if they can block inlining.  Try not to reject always_inline
11656    callees unless they are incompatible architecturally.  */
11657
11658 static bool
11659 aarch64_can_inline_p (tree caller, tree callee)
11660 {
11661   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11662   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11663
11664   struct cl_target_option *caller_opts
11665         = TREE_TARGET_OPTION (caller_tree ? caller_tree
11666                                            : target_option_default_node);
11667
11668   struct cl_target_option *callee_opts
11669         = TREE_TARGET_OPTION (callee_tree ? callee_tree
11670                                            : target_option_default_node);
11671
11672   /* Callee's ISA flags should be a subset of the caller's.  */
11673   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11674        != callee_opts->x_aarch64_isa_flags)
11675     return false;
11676
11677   /* Allow non-strict aligned functions inlining into strict
11678      aligned ones.  */
11679   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11680        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11681       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11682            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11683     return false;
11684
11685   bool always_inline = lookup_attribute ("always_inline",
11686                                           DECL_ATTRIBUTES (callee));
11687
11688   /* If the architectural features match up and the callee is always_inline
11689      then the other attributes don't matter.  */
11690   if (always_inline)
11691     return true;
11692
11693   if (caller_opts->x_aarch64_cmodel_var
11694       != callee_opts->x_aarch64_cmodel_var)
11695     return false;
11696
11697   if (caller_opts->x_aarch64_tls_dialect
11698       != callee_opts->x_aarch64_tls_dialect)
11699     return false;
11700
11701   /* Honour explicit requests to workaround errata.  */
11702   if (!aarch64_tribools_ok_for_inlining_p (
11703           caller_opts->x_aarch64_fix_a53_err835769,
11704           callee_opts->x_aarch64_fix_a53_err835769,
11705           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11706     return false;
11707
11708   if (!aarch64_tribools_ok_for_inlining_p (
11709           caller_opts->x_aarch64_fix_a53_err843419,
11710           callee_opts->x_aarch64_fix_a53_err843419,
11711           2, TARGET_FIX_ERR_A53_843419))
11712     return false;
11713
11714   /* If the user explicitly specified -momit-leaf-frame-pointer for the
11715      caller and calle and they don't match up, reject inlining.  */
11716   if (!aarch64_tribools_ok_for_inlining_p (
11717           caller_opts->x_flag_omit_leaf_frame_pointer,
11718           callee_opts->x_flag_omit_leaf_frame_pointer,
11719           2, 1))
11720     return false;
11721
11722   /* If the callee has specific tuning overrides, respect them.  */
11723   if (callee_opts->x_aarch64_override_tune_string != NULL
11724       && caller_opts->x_aarch64_override_tune_string == NULL)
11725     return false;
11726
11727   /* If the user specified tuning override strings for the
11728      caller and callee and they don't match up, reject inlining.
11729      We just do a string compare here, we don't analyze the meaning
11730      of the string, as it would be too costly for little gain.  */
11731   if (callee_opts->x_aarch64_override_tune_string
11732       && caller_opts->x_aarch64_override_tune_string
11733       && (strcmp (callee_opts->x_aarch64_override_tune_string,
11734                   caller_opts->x_aarch64_override_tune_string) != 0))
11735     return false;
11736
11737   return true;
11738 }
11739
11740 /* Return true if SYMBOL_REF X binds locally.  */
11741
11742 static bool
11743 aarch64_symbol_binds_local_p (const_rtx x)
11744 {
11745   return (SYMBOL_REF_DECL (x)
11746           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11747           : SYMBOL_REF_LOCAL_P (x));
11748 }
11749
11750 /* Return true if SYMBOL_REF X is thread local */
11751 static bool
11752 aarch64_tls_symbol_p (rtx x)
11753 {
11754   if (! TARGET_HAVE_TLS)
11755     return false;
11756
11757   if (GET_CODE (x) != SYMBOL_REF)
11758     return false;
11759
11760   return SYMBOL_REF_TLS_MODEL (x) != 0;
11761 }
11762
11763 /* Classify a TLS symbol into one of the TLS kinds.  */
11764 enum aarch64_symbol_type
11765 aarch64_classify_tls_symbol (rtx x)
11766 {
11767   enum tls_model tls_kind = tls_symbolic_operand_type (x);
11768
11769   switch (tls_kind)
11770     {
11771     case TLS_MODEL_GLOBAL_DYNAMIC:
11772     case TLS_MODEL_LOCAL_DYNAMIC:
11773       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11774
11775     case TLS_MODEL_INITIAL_EXEC:
11776       switch (aarch64_cmodel)
11777         {
11778         case AARCH64_CMODEL_TINY:
11779         case AARCH64_CMODEL_TINY_PIC:
11780           return SYMBOL_TINY_TLSIE;
11781         default:
11782           return SYMBOL_SMALL_TLSIE;
11783         }
11784
11785     case TLS_MODEL_LOCAL_EXEC:
11786       if (aarch64_tls_size == 12)
11787         return SYMBOL_TLSLE12;
11788       else if (aarch64_tls_size == 24)
11789         return SYMBOL_TLSLE24;
11790       else if (aarch64_tls_size == 32)
11791         return SYMBOL_TLSLE32;
11792       else if (aarch64_tls_size == 48)
11793         return SYMBOL_TLSLE48;
11794       else
11795         gcc_unreachable ();
11796
11797     case TLS_MODEL_EMULATED:
11798     case TLS_MODEL_NONE:
11799       return SYMBOL_FORCE_TO_MEM;
11800
11801     default:
11802       gcc_unreachable ();
11803     }
11804 }
11805
11806 /* Return the correct method for accessing X + OFFSET, where X is either
11807    a SYMBOL_REF or LABEL_REF.  */
11808
11809 enum aarch64_symbol_type
11810 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11811 {
11812   if (GET_CODE (x) == LABEL_REF)
11813     {
11814       switch (aarch64_cmodel)
11815         {
11816         case AARCH64_CMODEL_LARGE:
11817           return SYMBOL_FORCE_TO_MEM;
11818
11819         case AARCH64_CMODEL_TINY_PIC:
11820         case AARCH64_CMODEL_TINY:
11821           return SYMBOL_TINY_ABSOLUTE;
11822
11823         case AARCH64_CMODEL_SMALL_SPIC:
11824         case AARCH64_CMODEL_SMALL_PIC:
11825         case AARCH64_CMODEL_SMALL:
11826           return SYMBOL_SMALL_ABSOLUTE;
11827
11828         default:
11829           gcc_unreachable ();
11830         }
11831     }
11832
11833   if (GET_CODE (x) == SYMBOL_REF)
11834     {
11835       if (aarch64_tls_symbol_p (x))
11836         return aarch64_classify_tls_symbol (x);
11837
11838       switch (aarch64_cmodel)
11839         {
11840         case AARCH64_CMODEL_TINY:
11841           /* When we retrieve symbol + offset address, we have to make sure
11842              the offset does not cause overflow of the final address.  But
11843              we have no way of knowing the address of symbol at compile time
11844              so we can't accurately say if the distance between the PC and
11845              symbol + offset is outside the addressible range of +/-1M in the
11846              TINY code model.  So we rely on images not being greater than
11847              1M and cap the offset at 1M and anything beyond 1M will have to
11848              be loaded using an alternative mechanism.  Furthermore if the
11849              symbol is a weak reference to something that isn't known to
11850              resolve to a symbol in this module, then force to memory.  */
11851           if ((SYMBOL_REF_WEAK (x)
11852                && !aarch64_symbol_binds_local_p (x))
11853               || !IN_RANGE (offset, -1048575, 1048575))
11854             return SYMBOL_FORCE_TO_MEM;
11855           return SYMBOL_TINY_ABSOLUTE;
11856
11857         case AARCH64_CMODEL_SMALL:
11858           /* Same reasoning as the tiny code model, but the offset cap here is
11859              4G.  */
11860           if ((SYMBOL_REF_WEAK (x)
11861                && !aarch64_symbol_binds_local_p (x))
11862               || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11863                             HOST_WIDE_INT_C (4294967264)))
11864             return SYMBOL_FORCE_TO_MEM;
11865           return SYMBOL_SMALL_ABSOLUTE;
11866
11867         case AARCH64_CMODEL_TINY_PIC:
11868           if (!aarch64_symbol_binds_local_p (x))
11869             return SYMBOL_TINY_GOT;
11870           return SYMBOL_TINY_ABSOLUTE;
11871
11872         case AARCH64_CMODEL_SMALL_SPIC:
11873         case AARCH64_CMODEL_SMALL_PIC:
11874           if (!aarch64_symbol_binds_local_p (x))
11875             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11876                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11877           return SYMBOL_SMALL_ABSOLUTE;
11878
11879         case AARCH64_CMODEL_LARGE:
11880           /* This is alright even in PIC code as the constant
11881              pool reference is always PC relative and within
11882              the same translation unit.  */
11883           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11884             return SYMBOL_SMALL_ABSOLUTE;
11885           else
11886             return SYMBOL_FORCE_TO_MEM;
11887
11888         default:
11889           gcc_unreachable ();
11890         }
11891     }
11892
11893   /* By default push everything into the constant pool.  */
11894   return SYMBOL_FORCE_TO_MEM;
11895 }
11896
11897 bool
11898 aarch64_constant_address_p (rtx x)
11899 {
11900   return (CONSTANT_P (x) && memory_address_p (DImode, x));
11901 }
11902
11903 bool
11904 aarch64_legitimate_pic_operand_p (rtx x)
11905 {
11906   if (GET_CODE (x) == SYMBOL_REF
11907       || (GET_CODE (x) == CONST
11908           && GET_CODE (XEXP (x, 0)) == PLUS
11909           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11910      return false;
11911
11912   return true;
11913 }
11914
11915 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
11916    that should be rematerialized rather than spilled.  */
11917
11918 static bool
11919 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11920 {
11921   /* Support CSE and rematerialization of common constants.  */
11922   if (CONST_INT_P (x)
11923       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11924       || GET_CODE (x) == CONST_VECTOR)
11925     return true;
11926
11927   /* Do not allow vector struct mode constants for Advanced SIMD.
11928      We could support 0 and -1 easily, but they need support in
11929      aarch64-simd.md.  */
11930   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11931   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11932     return false;
11933
11934   /* Only accept variable-length vector constants if they can be
11935      handled directly.
11936
11937      ??? It would be possible to handle rematerialization of other
11938      constants via secondary reloads.  */
11939   if (vec_flags & VEC_ANY_SVE)
11940     return aarch64_simd_valid_immediate (x, NULL);
11941
11942   if (GET_CODE (x) == HIGH)
11943     x = XEXP (x, 0);
11944
11945   /* Accept polynomial constants that can be calculated by using the
11946      destination of a move as the sole temporary.  Constants that
11947      require a second temporary cannot be rematerialized (they can't be
11948      forced to memory and also aren't legitimate constants).  */
11949   poly_int64 offset;
11950   if (poly_int_rtx_p (x, &offset))
11951     return aarch64_offset_temporaries (false, offset) <= 1;
11952
11953   /* If an offset is being added to something else, we need to allow the
11954      base to be moved into the destination register, meaning that there
11955      are no free temporaries for the offset.  */
11956   x = strip_offset (x, &offset);
11957   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11958     return false;
11959
11960   /* Do not allow const (plus (anchor_symbol, const_int)).  */
11961   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11962     return false;
11963
11964   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
11965      so spilling them is better than rematerialization.  */
11966   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11967     return true;
11968
11969   /* Label references are always constant.  */
11970   if (GET_CODE (x) == LABEL_REF)
11971     return true;
11972
11973   return false;
11974 }
11975
11976 rtx
11977 aarch64_load_tp (rtx target)
11978 {
11979   if (!target
11980       || GET_MODE (target) != Pmode
11981       || !register_operand (target, Pmode))
11982     target = gen_reg_rtx (Pmode);
11983
11984   /* Can return in any reg.  */
11985   emit_insn (gen_aarch64_load_tp_hard (target));
11986   return target;
11987 }
11988
11989 /* On AAPCS systems, this is the "struct __va_list".  */
11990 static GTY(()) tree va_list_type;
11991
11992 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11993    Return the type to use as __builtin_va_list.
11994
11995    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11996
11997    struct __va_list
11998    {
11999      void *__stack;
12000      void *__gr_top;
12001      void *__vr_top;
12002      int   __gr_offs;
12003      int   __vr_offs;
12004    };  */
12005
12006 static tree
12007 aarch64_build_builtin_va_list (void)
12008 {
12009   tree va_list_name;
12010   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12011
12012   /* Create the type.  */
12013   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12014   /* Give it the required name.  */
12015   va_list_name = build_decl (BUILTINS_LOCATION,
12016                              TYPE_DECL,
12017                              get_identifier ("__va_list"),
12018                              va_list_type);
12019   DECL_ARTIFICIAL (va_list_name) = 1;
12020   TYPE_NAME (va_list_type) = va_list_name;
12021   TYPE_STUB_DECL (va_list_type) = va_list_name;
12022
12023   /* Create the fields.  */
12024   f_stack = build_decl (BUILTINS_LOCATION,
12025                         FIELD_DECL, get_identifier ("__stack"),
12026                         ptr_type_node);
12027   f_grtop = build_decl (BUILTINS_LOCATION,
12028                         FIELD_DECL, get_identifier ("__gr_top"),
12029                         ptr_type_node);
12030   f_vrtop = build_decl (BUILTINS_LOCATION,
12031                         FIELD_DECL, get_identifier ("__vr_top"),
12032                         ptr_type_node);
12033   f_groff = build_decl (BUILTINS_LOCATION,
12034                         FIELD_DECL, get_identifier ("__gr_offs"),
12035                         integer_type_node);
12036   f_vroff = build_decl (BUILTINS_LOCATION,
12037                         FIELD_DECL, get_identifier ("__vr_offs"),
12038                         integer_type_node);
12039
12040   /* Tell tree-stdarg pass about our internal offset fields.
12041      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
12042      purpose to identify whether the code is updating va_list internal
12043      offset fields through irregular way.  */
12044   va_list_gpr_counter_field = f_groff;
12045   va_list_fpr_counter_field = f_vroff;
12046
12047   DECL_ARTIFICIAL (f_stack) = 1;
12048   DECL_ARTIFICIAL (f_grtop) = 1;
12049   DECL_ARTIFICIAL (f_vrtop) = 1;
12050   DECL_ARTIFICIAL (f_groff) = 1;
12051   DECL_ARTIFICIAL (f_vroff) = 1;
12052
12053   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12054   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12055   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12056   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12057   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12058
12059   TYPE_FIELDS (va_list_type) = f_stack;
12060   DECL_CHAIN (f_stack) = f_grtop;
12061   DECL_CHAIN (f_grtop) = f_vrtop;
12062   DECL_CHAIN (f_vrtop) = f_groff;
12063   DECL_CHAIN (f_groff) = f_vroff;
12064
12065   /* Compute its layout.  */
12066   layout_type (va_list_type);
12067
12068   return va_list_type;
12069 }
12070
12071 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
12072 static void
12073 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12074 {
12075   const CUMULATIVE_ARGS *cum;
12076   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12077   tree stack, grtop, vrtop, groff, vroff;
12078   tree t;
12079   int gr_save_area_size = cfun->va_list_gpr_size;
12080   int vr_save_area_size = cfun->va_list_fpr_size;
12081   int vr_offset;
12082
12083   cum = &crtl->args.info;
12084   if (cfun->va_list_gpr_size)
12085     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12086                              cfun->va_list_gpr_size);
12087   if (cfun->va_list_fpr_size)
12088     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12089                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
12090
12091   if (!TARGET_FLOAT)
12092     {
12093       gcc_assert (cum->aapcs_nvrn == 0);
12094       vr_save_area_size = 0;
12095     }
12096
12097   f_stack = TYPE_FIELDS (va_list_type_node);
12098   f_grtop = DECL_CHAIN (f_stack);
12099   f_vrtop = DECL_CHAIN (f_grtop);
12100   f_groff = DECL_CHAIN (f_vrtop);
12101   f_vroff = DECL_CHAIN (f_groff);
12102
12103   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12104                   NULL_TREE);
12105   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12106                   NULL_TREE);
12107   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12108                   NULL_TREE);
12109   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12110                   NULL_TREE);
12111   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12112                   NULL_TREE);
12113
12114   /* Emit code to initialize STACK, which points to the next varargs stack
12115      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
12116      by named arguments.  STACK is 8-byte aligned.  */
12117   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12118   if (cum->aapcs_stack_size > 0)
12119     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12120   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12121   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12122
12123   /* Emit code to initialize GRTOP, the top of the GR save area.
12124      virtual_incoming_args_rtx should have been 16 byte aligned.  */
12125   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12126   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12127   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12128
12129   /* Emit code to initialize VRTOP, the top of the VR save area.
12130      This address is gr_save_area_bytes below GRTOP, rounded
12131      down to the next 16-byte boundary.  */
12132   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12133   vr_offset = ROUND_UP (gr_save_area_size,
12134                         STACK_BOUNDARY / BITS_PER_UNIT);
12135
12136   if (vr_offset)
12137     t = fold_build_pointer_plus_hwi (t, -vr_offset);
12138   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12139   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12140
12141   /* Emit code to initialize GROFF, the offset from GRTOP of the
12142      next GPR argument.  */
12143   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12144               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12145   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12146
12147   /* Likewise emit code to initialize VROFF, the offset from FTOP
12148      of the next VR argument.  */
12149   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12150               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12151   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12152 }
12153
12154 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
12155
12156 static tree
12157 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12158                               gimple_seq *post_p ATTRIBUTE_UNUSED)
12159 {
12160   tree addr;
12161   bool indirect_p;
12162   bool is_ha;           /* is HFA or HVA.  */
12163   bool dw_align;        /* double-word align.  */
12164   machine_mode ag_mode = VOIDmode;
12165   int nregs;
12166   machine_mode mode;
12167
12168   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12169   tree stack, f_top, f_off, off, arg, roundup, on_stack;
12170   HOST_WIDE_INT size, rsize, adjust, align;
12171   tree t, u, cond1, cond2;
12172
12173   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12174   if (indirect_p)
12175     type = build_pointer_type (type);
12176
12177   mode = TYPE_MODE (type);
12178
12179   f_stack = TYPE_FIELDS (va_list_type_node);
12180   f_grtop = DECL_CHAIN (f_stack);
12181   f_vrtop = DECL_CHAIN (f_grtop);
12182   f_groff = DECL_CHAIN (f_vrtop);
12183   f_vroff = DECL_CHAIN (f_groff);
12184
12185   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12186                   f_stack, NULL_TREE);
12187   size = int_size_in_bytes (type);
12188   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12189
12190   dw_align = false;
12191   adjust = 0;
12192   if (aarch64_vfp_is_call_or_return_candidate (mode,
12193                                                type,
12194                                                &ag_mode,
12195                                                &nregs,
12196                                                &is_ha))
12197     {
12198       /* No frontends can create types with variable-sized modes, so we
12199          shouldn't be asked to pass or return them.  */
12200       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12201
12202       /* TYPE passed in fp/simd registers.  */
12203       if (!TARGET_FLOAT)
12204         aarch64_err_no_fpadvsimd (mode, "varargs");
12205
12206       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12207                       unshare_expr (valist), f_vrtop, NULL_TREE);
12208       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12209                       unshare_expr (valist), f_vroff, NULL_TREE);
12210
12211       rsize = nregs * UNITS_PER_VREG;
12212
12213       if (is_ha)
12214         {
12215           if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12216             adjust = UNITS_PER_VREG - ag_size;
12217         }
12218       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12219                && size < UNITS_PER_VREG)
12220         {
12221           adjust = UNITS_PER_VREG - size;
12222         }
12223     }
12224   else
12225     {
12226       /* TYPE passed in general registers.  */
12227       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12228                       unshare_expr (valist), f_grtop, NULL_TREE);
12229       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12230                       unshare_expr (valist), f_groff, NULL_TREE);
12231       rsize = ROUND_UP (size, UNITS_PER_WORD);
12232       nregs = rsize / UNITS_PER_WORD;
12233
12234       if (align > 8)
12235         dw_align = true;
12236
12237       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12238           && size < UNITS_PER_WORD)
12239         {
12240           adjust = UNITS_PER_WORD  - size;
12241         }
12242     }
12243
12244   /* Get a local temporary for the field value.  */
12245   off = get_initialized_tmp_var (f_off, pre_p, NULL);
12246
12247   /* Emit code to branch if off >= 0.  */
12248   t = build2 (GE_EXPR, boolean_type_node, off,
12249               build_int_cst (TREE_TYPE (off), 0));
12250   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12251
12252   if (dw_align)
12253     {
12254       /* Emit: offs = (offs + 15) & -16.  */
12255       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12256                   build_int_cst (TREE_TYPE (off), 15));
12257       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12258                   build_int_cst (TREE_TYPE (off), -16));
12259       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12260     }
12261   else
12262     roundup = NULL;
12263
12264   /* Update ap.__[g|v]r_offs  */
12265   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12266               build_int_cst (TREE_TYPE (off), rsize));
12267   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12268
12269   /* String up.  */
12270   if (roundup)
12271     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12272
12273   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
12274   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12275               build_int_cst (TREE_TYPE (f_off), 0));
12276   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12277
12278   /* String up: make sure the assignment happens before the use.  */
12279   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12280   COND_EXPR_ELSE (cond1) = t;
12281
12282   /* Prepare the trees handling the argument that is passed on the stack;
12283      the top level node will store in ON_STACK.  */
12284   arg = get_initialized_tmp_var (stack, pre_p, NULL);
12285   if (align > 8)
12286     {
12287       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
12288       t = fold_build_pointer_plus_hwi (arg, 15);
12289       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12290                   build_int_cst (TREE_TYPE (t), -16));
12291       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12292     }
12293   else
12294     roundup = NULL;
12295   /* Advance ap.__stack  */
12296   t = fold_build_pointer_plus_hwi (arg, size + 7);
12297   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12298               build_int_cst (TREE_TYPE (t), -8));
12299   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12300   /* String up roundup and advance.  */
12301   if (roundup)
12302     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12303   /* String up with arg */
12304   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12305   /* Big-endianness related address adjustment.  */
12306   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12307       && size < UNITS_PER_WORD)
12308   {
12309     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12310                 size_int (UNITS_PER_WORD - size));
12311     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12312   }
12313
12314   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12315   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12316
12317   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
12318   t = off;
12319   if (adjust)
12320     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12321                 build_int_cst (TREE_TYPE (off), adjust));
12322
12323   t = fold_convert (sizetype, t);
12324   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12325
12326   if (is_ha)
12327     {
12328       /* type ha; // treat as "struct {ftype field[n];}"
12329          ... [computing offs]
12330          for (i = 0; i <nregs; ++i, offs += 16)
12331            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12332          return ha;  */
12333       int i;
12334       tree tmp_ha, field_t, field_ptr_t;
12335
12336       /* Declare a local variable.  */
12337       tmp_ha = create_tmp_var_raw (type, "ha");
12338       gimple_add_tmp_var (tmp_ha);
12339
12340       /* Establish the base type.  */
12341       switch (ag_mode)
12342         {
12343         case E_SFmode:
12344           field_t = float_type_node;
12345           field_ptr_t = float_ptr_type_node;
12346           break;
12347         case E_DFmode:
12348           field_t = double_type_node;
12349           field_ptr_t = double_ptr_type_node;
12350           break;
12351         case E_TFmode:
12352           field_t = long_double_type_node;
12353           field_ptr_t = long_double_ptr_type_node;
12354           break;
12355         case E_HFmode:
12356           field_t = aarch64_fp16_type_node;
12357           field_ptr_t = aarch64_fp16_ptr_type_node;
12358           break;
12359         case E_V2SImode:
12360         case E_V4SImode:
12361             {
12362               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12363               field_t = build_vector_type_for_mode (innertype, ag_mode);
12364               field_ptr_t = build_pointer_type (field_t);
12365             }
12366           break;
12367         default:
12368           gcc_assert (0);
12369         }
12370
12371       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
12372       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12373       addr = t;
12374       t = fold_convert (field_ptr_t, addr);
12375       t = build2 (MODIFY_EXPR, field_t,
12376                   build1 (INDIRECT_REF, field_t, tmp_ha),
12377                   build1 (INDIRECT_REF, field_t, t));
12378
12379       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
12380       for (i = 1; i < nregs; ++i)
12381         {
12382           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12383           u = fold_convert (field_ptr_t, addr);
12384           u = build2 (MODIFY_EXPR, field_t,
12385                       build2 (MEM_REF, field_t, tmp_ha,
12386                               build_int_cst (field_ptr_t,
12387                                              (i *
12388                                               int_size_in_bytes (field_t)))),
12389                       build1 (INDIRECT_REF, field_t, u));
12390           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12391         }
12392
12393       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12394       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12395     }
12396
12397   COND_EXPR_ELSE (cond2) = t;
12398   addr = fold_convert (build_pointer_type (type), cond1);
12399   addr = build_va_arg_indirect_ref (addr);
12400
12401   if (indirect_p)
12402     addr = build_va_arg_indirect_ref (addr);
12403
12404   return addr;
12405 }
12406
12407 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
12408
12409 static void
12410 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12411                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12412                                 int no_rtl)
12413 {
12414   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12415   CUMULATIVE_ARGS local_cum;
12416   int gr_saved = cfun->va_list_gpr_size;
12417   int vr_saved = cfun->va_list_fpr_size;
12418
12419   /* The caller has advanced CUM up to, but not beyond, the last named
12420      argument.  Advance a local copy of CUM past the last "real" named
12421      argument, to find out how many registers are left over.  */
12422   local_cum = *cum;
12423   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12424
12425   /* Found out how many registers we need to save.
12426      Honor tree-stdvar analysis results.  */
12427   if (cfun->va_list_gpr_size)
12428     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12429                     cfun->va_list_gpr_size / UNITS_PER_WORD);
12430   if (cfun->va_list_fpr_size)
12431     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12432                     cfun->va_list_fpr_size / UNITS_PER_VREG);
12433
12434   if (!TARGET_FLOAT)
12435     {
12436       gcc_assert (local_cum.aapcs_nvrn == 0);
12437       vr_saved = 0;
12438     }
12439
12440   if (!no_rtl)
12441     {
12442       if (gr_saved > 0)
12443         {
12444           rtx ptr, mem;
12445
12446           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
12447           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12448                                - gr_saved * UNITS_PER_WORD);
12449           mem = gen_frame_mem (BLKmode, ptr);
12450           set_mem_alias_set (mem, get_varargs_alias_set ());
12451
12452           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12453                                mem, gr_saved);
12454         }
12455       if (vr_saved > 0)
12456         {
12457           /* We can't use move_block_from_reg, because it will use
12458              the wrong mode, storing D regs only.  */
12459           machine_mode mode = TImode;
12460           int off, i, vr_start;
12461
12462           /* Set OFF to the offset from virtual_incoming_args_rtx of
12463              the first vector register.  The VR save area lies below
12464              the GR one, and is aligned to 16 bytes.  */
12465           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12466                            STACK_BOUNDARY / BITS_PER_UNIT);
12467           off -= vr_saved * UNITS_PER_VREG;
12468
12469           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12470           for (i = 0; i < vr_saved; ++i)
12471             {
12472               rtx ptr, mem;
12473
12474               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12475               mem = gen_frame_mem (mode, ptr);
12476               set_mem_alias_set (mem, get_varargs_alias_set ());
12477               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12478               off += UNITS_PER_VREG;
12479             }
12480         }
12481     }
12482
12483   /* We don't save the size into *PRETEND_SIZE because we want to avoid
12484      any complication of having crtl->args.pretend_args_size changed.  */
12485   cfun->machine->frame.saved_varargs_size
12486     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12487                  STACK_BOUNDARY / BITS_PER_UNIT)
12488        + vr_saved * UNITS_PER_VREG);
12489 }
12490
12491 static void
12492 aarch64_conditional_register_usage (void)
12493 {
12494   int i;
12495   if (!TARGET_FLOAT)
12496     {
12497       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12498         {
12499           fixed_regs[i] = 1;
12500           call_used_regs[i] = 1;
12501         }
12502     }
12503   if (!TARGET_SVE)
12504     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12505       {
12506         fixed_regs[i] = 1;
12507         call_used_regs[i] = 1;
12508       }
12509 }
12510
12511 /* Walk down the type tree of TYPE counting consecutive base elements.
12512    If *MODEP is VOIDmode, then set it to the first valid floating point
12513    type.  If a non-floating point type is found, or if a floating point
12514    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12515    otherwise return the count in the sub-tree.  */
12516 static int
12517 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12518 {
12519   machine_mode mode;
12520   HOST_WIDE_INT size;
12521
12522   switch (TREE_CODE (type))
12523     {
12524     case REAL_TYPE:
12525       mode = TYPE_MODE (type);
12526       if (mode != DFmode && mode != SFmode
12527           && mode != TFmode && mode != HFmode)
12528         return -1;
12529
12530       if (*modep == VOIDmode)
12531         *modep = mode;
12532
12533       if (*modep == mode)
12534         return 1;
12535
12536       break;
12537
12538     case COMPLEX_TYPE:
12539       mode = TYPE_MODE (TREE_TYPE (type));
12540       if (mode != DFmode && mode != SFmode
12541           && mode != TFmode && mode != HFmode)
12542         return -1;
12543
12544       if (*modep == VOIDmode)
12545         *modep = mode;
12546
12547       if (*modep == mode)
12548         return 2;
12549
12550       break;
12551
12552     case VECTOR_TYPE:
12553       /* Use V2SImode and V4SImode as representatives of all 64-bit
12554          and 128-bit vector types.  */
12555       size = int_size_in_bytes (type);
12556       switch (size)
12557         {
12558         case 8:
12559           mode = V2SImode;
12560           break;
12561         case 16:
12562           mode = V4SImode;
12563           break;
12564         default:
12565           return -1;
12566         }
12567
12568       if (*modep == VOIDmode)
12569         *modep = mode;
12570
12571       /* Vector modes are considered to be opaque: two vectors are
12572          equivalent for the purposes of being homogeneous aggregates
12573          if they are the same size.  */
12574       if (*modep == mode)
12575         return 1;
12576
12577       break;
12578
12579     case ARRAY_TYPE:
12580       {
12581         int count;
12582         tree index = TYPE_DOMAIN (type);
12583
12584         /* Can't handle incomplete types nor sizes that are not
12585            fixed.  */
12586         if (!COMPLETE_TYPE_P (type)
12587             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12588           return -1;
12589
12590         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12591         if (count == -1
12592             || !index
12593             || !TYPE_MAX_VALUE (index)
12594             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12595             || !TYPE_MIN_VALUE (index)
12596             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12597             || count < 0)
12598           return -1;
12599
12600         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12601                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12602
12603         /* There must be no padding.  */
12604         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12605                       count * GET_MODE_BITSIZE (*modep)))
12606           return -1;
12607
12608         return count;
12609       }
12610
12611     case RECORD_TYPE:
12612       {
12613         int count = 0;
12614         int sub_count;
12615         tree field;
12616
12617         /* Can't handle incomplete types nor sizes that are not
12618            fixed.  */
12619         if (!COMPLETE_TYPE_P (type)
12620             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12621           return -1;
12622
12623         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12624           {
12625             if (TREE_CODE (field) != FIELD_DECL)
12626               continue;
12627
12628             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12629             if (sub_count < 0)
12630               return -1;
12631             count += sub_count;
12632           }
12633
12634         /* There must be no padding.  */
12635         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12636                       count * GET_MODE_BITSIZE (*modep)))
12637           return -1;
12638
12639         return count;
12640       }
12641
12642     case UNION_TYPE:
12643     case QUAL_UNION_TYPE:
12644       {
12645         /* These aren't very interesting except in a degenerate case.  */
12646         int count = 0;
12647         int sub_count;
12648         tree field;
12649
12650         /* Can't handle incomplete types nor sizes that are not
12651            fixed.  */
12652         if (!COMPLETE_TYPE_P (type)
12653             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12654           return -1;
12655
12656         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12657           {
12658             if (TREE_CODE (field) != FIELD_DECL)
12659               continue;
12660
12661             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12662             if (sub_count < 0)
12663               return -1;
12664             count = count > sub_count ? count : sub_count;
12665           }
12666
12667         /* There must be no padding.  */
12668         if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12669                       count * GET_MODE_BITSIZE (*modep)))
12670           return -1;
12671
12672         return count;
12673       }
12674
12675     default:
12676       break;
12677     }
12678
12679   return -1;
12680 }
12681
12682 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12683    type as described in AAPCS64 \S 4.1.2.
12684
12685    See the comment above aarch64_composite_type_p for the notes on MODE.  */
12686
12687 static bool
12688 aarch64_short_vector_p (const_tree type,
12689                         machine_mode mode)
12690 {
12691   poly_int64 size = -1;
12692
12693   if (type && TREE_CODE (type) == VECTOR_TYPE)
12694     size = int_size_in_bytes (type);
12695   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12696             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12697     size = GET_MODE_SIZE (mode);
12698
12699   return known_eq (size, 8) || known_eq (size, 16);
12700 }
12701
12702 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12703    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
12704    array types.  The C99 floating-point complex types are also considered
12705    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
12706    types, which are GCC extensions and out of the scope of AAPCS64, are
12707    treated as composite types here as well.
12708
12709    Note that MODE itself is not sufficient in determining whether a type
12710    is such a composite type or not.  This is because
12711    stor-layout.c:compute_record_mode may have already changed the MODE
12712    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
12713    structure with only one field may have its MODE set to the mode of the
12714    field.  Also an integer mode whose size matches the size of the
12715    RECORD_TYPE type may be used to substitute the original mode
12716    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
12717    solely relied on.  */
12718
12719 static bool
12720 aarch64_composite_type_p (const_tree type,
12721                           machine_mode mode)
12722 {
12723   if (aarch64_short_vector_p (type, mode))
12724     return false;
12725
12726   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12727     return true;
12728
12729   if (mode == BLKmode
12730       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12731       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12732     return true;
12733
12734   return false;
12735 }
12736
12737 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12738    shall be passed or returned in simd/fp register(s) (providing these
12739    parameter passing registers are available).
12740
12741    Upon successful return, *COUNT returns the number of needed registers,
12742    *BASE_MODE returns the mode of the individual register and when IS_HAF
12743    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12744    floating-point aggregate or a homogeneous short-vector aggregate.  */
12745
12746 static bool
12747 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12748                                          const_tree type,
12749                                          machine_mode *base_mode,
12750                                          int *count,
12751                                          bool *is_ha)
12752 {
12753   machine_mode new_mode = VOIDmode;
12754   bool composite_p = aarch64_composite_type_p (type, mode);
12755
12756   if (is_ha != NULL) *is_ha = false;
12757
12758   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12759       || aarch64_short_vector_p (type, mode))
12760     {
12761       *count = 1;
12762       new_mode = mode;
12763     }
12764   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12765     {
12766       if (is_ha != NULL) *is_ha = true;
12767       *count = 2;
12768       new_mode = GET_MODE_INNER (mode);
12769     }
12770   else if (type && composite_p)
12771     {
12772       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12773
12774       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12775         {
12776           if (is_ha != NULL) *is_ha = true;
12777           *count = ag_count;
12778         }
12779       else
12780         return false;
12781     }
12782   else
12783     return false;
12784
12785   *base_mode = new_mode;
12786   return true;
12787 }
12788
12789 /* Implement TARGET_STRUCT_VALUE_RTX.  */
12790
12791 static rtx
12792 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12793                           int incoming ATTRIBUTE_UNUSED)
12794 {
12795   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12796 }
12797
12798 /* Implements target hook vector_mode_supported_p.  */
12799 static bool
12800 aarch64_vector_mode_supported_p (machine_mode mode)
12801 {
12802   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12803   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12804 }
12805
12806 /* Return appropriate SIMD container
12807    for MODE within a vector of WIDTH bits.  */
12808 static machine_mode
12809 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12810 {
12811   if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12812     switch (mode)
12813       {
12814       case E_DFmode:
12815         return VNx2DFmode;
12816       case E_SFmode:
12817         return VNx4SFmode;
12818       case E_HFmode:
12819         return VNx8HFmode;
12820       case E_DImode:
12821         return VNx2DImode;
12822       case E_SImode:
12823         return VNx4SImode;
12824       case E_HImode:
12825         return VNx8HImode;
12826       case E_QImode:
12827         return VNx16QImode;
12828       default:
12829         return word_mode;
12830       }
12831
12832   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12833   if (TARGET_SIMD)
12834     {
12835       if (known_eq (width, 128))
12836         switch (mode)
12837           {
12838           case E_DFmode:
12839             return V2DFmode;
12840           case E_SFmode:
12841             return V4SFmode;
12842           case E_HFmode:
12843             return V8HFmode;
12844           case E_SImode:
12845             return V4SImode;
12846           case E_HImode:
12847             return V8HImode;
12848           case E_QImode:
12849             return V16QImode;
12850           case E_DImode:
12851             return V2DImode;
12852           default:
12853             break;
12854           }
12855       else
12856         switch (mode)
12857           {
12858           case E_SFmode:
12859             return V2SFmode;
12860           case E_HFmode:
12861             return V4HFmode;
12862           case E_SImode:
12863             return V2SImode;
12864           case E_HImode:
12865             return V4HImode;
12866           case E_QImode:
12867             return V8QImode;
12868           default:
12869             break;
12870           }
12871     }
12872   return word_mode;
12873 }
12874
12875 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
12876 static machine_mode
12877 aarch64_preferred_simd_mode (scalar_mode mode)
12878 {
12879   poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12880   return aarch64_simd_container_mode (mode, bits);
12881 }
12882
12883 /* Return a list of possible vector sizes for the vectorizer
12884    to iterate over.  */
12885 static void
12886 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12887 {
12888   if (TARGET_SVE)
12889     sizes->safe_push (BYTES_PER_SVE_VECTOR);
12890   sizes->safe_push (16);
12891   sizes->safe_push (8);
12892 }
12893
12894 /* Implement TARGET_MANGLE_TYPE.  */
12895
12896 static const char *
12897 aarch64_mangle_type (const_tree type)
12898 {
12899   /* The AArch64 ABI documents say that "__va_list" has to be
12900      managled as if it is in the "std" namespace.  */
12901   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12902     return "St9__va_list";
12903
12904   /* Half-precision float.  */
12905   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12906     return "Dh";
12907
12908   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
12909      builtin types.  */
12910   if (TYPE_NAME (type) != NULL)
12911     return aarch64_mangle_builtin_type (type);
12912
12913   /* Use the default mangling.  */
12914   return NULL;
12915 }
12916
12917 /* Find the first rtx_insn before insn that will generate an assembly
12918    instruction.  */
12919
12920 static rtx_insn *
12921 aarch64_prev_real_insn (rtx_insn *insn)
12922 {
12923   if (!insn)
12924     return NULL;
12925
12926   do
12927     {
12928       insn = prev_real_insn (insn);
12929     }
12930   while (insn && recog_memoized (insn) < 0);
12931
12932   return insn;
12933 }
12934
12935 static bool
12936 is_madd_op (enum attr_type t1)
12937 {
12938   unsigned int i;
12939   /* A number of these may be AArch32 only.  */
12940   enum attr_type mlatypes[] = {
12941     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12942     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12943     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12944   };
12945
12946   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12947     {
12948       if (t1 == mlatypes[i])
12949         return true;
12950     }
12951
12952   return false;
12953 }
12954
12955 /* Check if there is a register dependency between a load and the insn
12956    for which we hold recog_data.  */
12957
12958 static bool
12959 dep_between_memop_and_curr (rtx memop)
12960 {
12961   rtx load_reg;
12962   int opno;
12963
12964   gcc_assert (GET_CODE (memop) == SET);
12965
12966   if (!REG_P (SET_DEST (memop)))
12967     return false;
12968
12969   load_reg = SET_DEST (memop);
12970   for (opno = 1; opno < recog_data.n_operands; opno++)
12971     {
12972       rtx operand = recog_data.operand[opno];
12973       if (REG_P (operand)
12974           && reg_overlap_mentioned_p (load_reg, operand))
12975         return true;
12976
12977     }
12978   return false;
12979 }
12980
12981
12982 /* When working around the Cortex-A53 erratum 835769,
12983    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12984    instruction and has a preceding memory instruction such that a NOP
12985    should be inserted between them.  */
12986
12987 bool
12988 aarch64_madd_needs_nop (rtx_insn* insn)
12989 {
12990   enum attr_type attr_type;
12991   rtx_insn *prev;
12992   rtx body;
12993
12994   if (!TARGET_FIX_ERR_A53_835769)
12995     return false;
12996
12997   if (!INSN_P (insn) || recog_memoized (insn) < 0)
12998     return false;
12999
13000   attr_type = get_attr_type (insn);
13001   if (!is_madd_op (attr_type))
13002     return false;
13003
13004   prev = aarch64_prev_real_insn (insn);
13005   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13006      Restore recog state to INSN to avoid state corruption.  */
13007   extract_constrain_insn_cached (insn);
13008
13009   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13010     return false;
13011
13012   body = single_set (prev);
13013
13014   /* If the previous insn is a memory op and there is no dependency between
13015      it and the DImode madd, emit a NOP between them.  If body is NULL then we
13016      have a complex memory operation, probably a load/store pair.
13017      Be conservative for now and emit a NOP.  */
13018   if (GET_MODE (recog_data.operand[0]) == DImode
13019       && (!body || !dep_between_memop_and_curr (body)))
13020     return true;
13021
13022   return false;
13023
13024 }
13025
13026
13027 /* Implement FINAL_PRESCAN_INSN.  */
13028
13029 void
13030 aarch64_final_prescan_insn (rtx_insn *insn)
13031 {
13032   if (aarch64_madd_needs_nop (insn))
13033     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13034 }
13035
13036
13037 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13038    instruction.  */
13039
13040 bool
13041 aarch64_sve_index_immediate_p (rtx base_or_step)
13042 {
13043   return (CONST_INT_P (base_or_step)
13044           && IN_RANGE (INTVAL (base_or_step), -16, 15));
13045 }
13046
13047 /* Return true if X is a valid immediate for the SVE ADD and SUB
13048    instructions.  Negate X first if NEGATE_P is true.  */
13049
13050 bool
13051 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13052 {
13053   rtx elt;
13054
13055   if (!const_vec_duplicate_p (x, &elt)
13056       || !CONST_INT_P (elt))
13057     return false;
13058
13059   HOST_WIDE_INT val = INTVAL (elt);
13060   if (negate_p)
13061     val = -val;
13062   val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13063
13064   if (val & 0xff)
13065     return IN_RANGE (val, 0, 0xff);
13066   return IN_RANGE (val, 0, 0xff00);
13067 }
13068
13069 /* Return true if X is a valid immediate operand for an SVE logical
13070    instruction such as AND.  */
13071
13072 bool
13073 aarch64_sve_bitmask_immediate_p (rtx x)
13074 {
13075   rtx elt;
13076
13077   return (const_vec_duplicate_p (x, &elt)
13078           && CONST_INT_P (elt)
13079           && aarch64_bitmask_imm (INTVAL (elt),
13080                                   GET_MODE_INNER (GET_MODE (x))));
13081 }
13082
13083 /* Return true if X is a valid immediate for the SVE DUP and CPY
13084    instructions.  */
13085
13086 bool
13087 aarch64_sve_dup_immediate_p (rtx x)
13088 {
13089   rtx elt;
13090
13091   if (!const_vec_duplicate_p (x, &elt)
13092       || !CONST_INT_P (elt))
13093     return false;
13094
13095   HOST_WIDE_INT val = INTVAL (elt);
13096   if (val & 0xff)
13097     return IN_RANGE (val, -0x80, 0x7f);
13098   return IN_RANGE (val, -0x8000, 0x7f00);
13099 }
13100
13101 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13102    SIGNED_P says whether the operand is signed rather than unsigned.  */
13103
13104 bool
13105 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13106 {
13107   rtx elt;
13108
13109   return (const_vec_duplicate_p (x, &elt)
13110           && CONST_INT_P (elt)
13111           && (signed_p
13112               ? IN_RANGE (INTVAL (elt), -16, 15)
13113               : IN_RANGE (INTVAL (elt), 0, 127)));
13114 }
13115
13116 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13117    instruction.  Negate X first if NEGATE_P is true.  */
13118
13119 bool
13120 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13121 {
13122   rtx elt;
13123   REAL_VALUE_TYPE r;
13124
13125   if (!const_vec_duplicate_p (x, &elt)
13126       || GET_CODE (elt) != CONST_DOUBLE)
13127     return false;
13128
13129   r = *CONST_DOUBLE_REAL_VALUE (elt);
13130
13131   if (negate_p)
13132     r = real_value_negate (&r);
13133
13134   if (real_equal (&r, &dconst1))
13135     return true;
13136   if (real_equal (&r, &dconsthalf))
13137     return true;
13138   return false;
13139 }
13140
13141 /* Return true if X is a valid immediate operand for an SVE FMUL
13142    instruction.  */
13143
13144 bool
13145 aarch64_sve_float_mul_immediate_p (rtx x)
13146 {
13147   rtx elt;
13148
13149   /* GCC will never generate a multiply with an immediate of 2, so there is no
13150      point testing for it (even though it is a valid constant).  */
13151   return (const_vec_duplicate_p (x, &elt)
13152           && GET_CODE (elt) == CONST_DOUBLE
13153           && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13154 }
13155
13156 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13157    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
13158    is nonnull, use it to describe valid immediates.  */
13159 static bool
13160 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13161                                     simd_immediate_info *info,
13162                                     enum simd_immediate_check which,
13163                                     simd_immediate_info::insn_type insn)
13164 {
13165   /* Try a 4-byte immediate with LSL.  */
13166   for (unsigned int shift = 0; shift < 32; shift += 8)
13167     if ((val32 & (0xff << shift)) == val32)
13168       {
13169         if (info)
13170           *info = simd_immediate_info (SImode, val32 >> shift, insn,
13171                                        simd_immediate_info::LSL, shift);
13172         return true;
13173       }
13174
13175   /* Try a 2-byte immediate with LSL.  */
13176   unsigned int imm16 = val32 & 0xffff;
13177   if (imm16 == (val32 >> 16))
13178     for (unsigned int shift = 0; shift < 16; shift += 8)
13179       if ((imm16 & (0xff << shift)) == imm16)
13180         {
13181           if (info)
13182             *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13183                                          simd_immediate_info::LSL, shift);
13184           return true;
13185         }
13186
13187   /* Try a 4-byte immediate with MSL, except for cases that MVN
13188      can handle.  */
13189   if (which == AARCH64_CHECK_MOV)
13190     for (unsigned int shift = 8; shift < 24; shift += 8)
13191       {
13192         unsigned int low = (1 << shift) - 1;
13193         if (((val32 & (0xff << shift)) | low) == val32)
13194           {
13195             if (info)
13196               *info = simd_immediate_info (SImode, val32 >> shift, insn,
13197                                            simd_immediate_info::MSL, shift);
13198             return true;
13199           }
13200       }
13201
13202   return false;
13203 }
13204
13205 /* Return true if replicating VAL64 is a valid immediate for the
13206    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
13207    use it to describe valid immediates.  */
13208 static bool
13209 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13210                                  simd_immediate_info *info,
13211                                  enum simd_immediate_check which)
13212 {
13213   unsigned int val32 = val64 & 0xffffffff;
13214   unsigned int val16 = val64 & 0xffff;
13215   unsigned int val8 = val64 & 0xff;
13216
13217   if (val32 == (val64 >> 32))
13218     {
13219       if ((which & AARCH64_CHECK_ORR) != 0
13220           && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13221                                                  simd_immediate_info::MOV))
13222         return true;
13223
13224       if ((which & AARCH64_CHECK_BIC) != 0
13225           && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13226                                                  simd_immediate_info::MVN))
13227         return true;
13228
13229       /* Try using a replicated byte.  */
13230       if (which == AARCH64_CHECK_MOV
13231           && val16 == (val32 >> 16)
13232           && val8 == (val16 >> 8))
13233         {
13234           if (info)
13235             *info = simd_immediate_info (QImode, val8);
13236           return true;
13237         }
13238     }
13239
13240   /* Try using a bit-to-bytemask.  */
13241   if (which == AARCH64_CHECK_MOV)
13242     {
13243       unsigned int i;
13244       for (i = 0; i < 64; i += 8)
13245         {
13246           unsigned char byte = (val64 >> i) & 0xff;
13247           if (byte != 0 && byte != 0xff)
13248             break;
13249         }
13250       if (i == 64)
13251         {
13252           if (info)
13253             *info = simd_immediate_info (DImode, val64);
13254           return true;
13255         }
13256     }
13257   return false;
13258 }
13259
13260 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13261    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
13262
13263 static bool
13264 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13265                              simd_immediate_info *info)
13266 {
13267   scalar_int_mode mode = DImode;
13268   unsigned int val32 = val64 & 0xffffffff;
13269   if (val32 == (val64 >> 32))
13270     {
13271       mode = SImode;
13272       unsigned int val16 = val32 & 0xffff;
13273       if (val16 == (val32 >> 16))
13274         {
13275           mode = HImode;
13276           unsigned int val8 = val16 & 0xff;
13277           if (val8 == (val16 >> 8))
13278             mode = QImode;
13279         }
13280     }
13281   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13282   if (IN_RANGE (val, -0x80, 0x7f))
13283     {
13284       /* DUP with no shift.  */
13285       if (info)
13286         *info = simd_immediate_info (mode, val);
13287       return true;
13288     }
13289   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13290     {
13291       /* DUP with LSL #8.  */
13292       if (info)
13293         *info = simd_immediate_info (mode, val);
13294       return true;
13295     }
13296   if (aarch64_bitmask_imm (val64, mode))
13297     {
13298       /* DUPM.  */
13299       if (info)
13300         *info = simd_immediate_info (mode, val);
13301       return true;
13302     }
13303   return false;
13304 }
13305
13306 /* Return true if OP is a valid SIMD immediate for the operation
13307    described by WHICH.  If INFO is nonnull, use it to describe valid
13308    immediates.  */
13309 bool
13310 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13311                               enum simd_immediate_check which)
13312 {
13313   machine_mode mode = GET_MODE (op);
13314   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13315   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13316     return false;
13317
13318   scalar_mode elt_mode = GET_MODE_INNER (mode);
13319   rtx base, step;
13320   unsigned int n_elts;
13321   if (GET_CODE (op) == CONST_VECTOR
13322       && CONST_VECTOR_DUPLICATE_P (op))
13323     n_elts = CONST_VECTOR_NPATTERNS (op);
13324   else if ((vec_flags & VEC_SVE_DATA)
13325            && const_vec_series_p (op, &base, &step))
13326     {
13327       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13328       if (!aarch64_sve_index_immediate_p (base)
13329           || !aarch64_sve_index_immediate_p (step))
13330         return false;
13331
13332       if (info)
13333         *info = simd_immediate_info (elt_mode, base, step);
13334       return true;
13335     }
13336   else if (GET_CODE (op) == CONST_VECTOR
13337            && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13338     /* N_ELTS set above.  */;
13339   else
13340     return false;
13341
13342   /* Handle PFALSE and PTRUE.  */
13343   if (vec_flags & VEC_SVE_PRED)
13344     return (op == CONST0_RTX (mode)
13345             || op == CONSTM1_RTX (mode));
13346
13347   scalar_float_mode elt_float_mode;
13348   if (n_elts == 1
13349       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13350     {
13351       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13352       if (aarch64_float_const_zero_rtx_p (elt)
13353           || aarch64_float_const_representable_p (elt))
13354         {
13355           if (info)
13356             *info = simd_immediate_info (elt_float_mode, elt);
13357           return true;
13358         }
13359     }
13360
13361   unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13362   if (elt_size > 8)
13363     return false;
13364
13365   scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13366
13367   /* Expand the vector constant out into a byte vector, with the least
13368      significant byte of the register first.  */
13369   auto_vec<unsigned char, 16> bytes;
13370   bytes.reserve (n_elts * elt_size);
13371   for (unsigned int i = 0; i < n_elts; i++)
13372     {
13373       /* The vector is provided in gcc endian-neutral fashion.
13374          For aarch64_be Advanced SIMD, it must be laid out in the vector
13375          register in reverse order.  */
13376       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13377       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13378
13379       if (elt_mode != elt_int_mode)
13380         elt = gen_lowpart (elt_int_mode, elt);
13381
13382       if (!CONST_INT_P (elt))
13383         return false;
13384
13385       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13386       for (unsigned int byte = 0; byte < elt_size; byte++)
13387         {
13388           bytes.quick_push (elt_val & 0xff);
13389           elt_val >>= BITS_PER_UNIT;
13390         }
13391     }
13392
13393   /* The immediate must repeat every eight bytes.  */
13394   unsigned int nbytes = bytes.length ();
13395   for (unsigned i = 8; i < nbytes; ++i)
13396     if (bytes[i] != bytes[i - 8])
13397       return false;
13398
13399   /* Get the repeating 8-byte value as an integer.  No endian correction
13400      is needed here because bytes is already in lsb-first order.  */
13401   unsigned HOST_WIDE_INT val64 = 0;
13402   for (unsigned int i = 0; i < 8; i++)
13403     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13404               << (i * BITS_PER_UNIT));
13405
13406   if (vec_flags & VEC_SVE_DATA)
13407     return aarch64_sve_valid_immediate (val64, info);
13408   else
13409     return aarch64_advsimd_valid_immediate (val64, info, which);
13410 }
13411
13412 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13413    has a step in the range of INDEX.  Return the index expression if so,
13414    otherwise return null.  */
13415 rtx
13416 aarch64_check_zero_based_sve_index_immediate (rtx x)
13417 {
13418   rtx base, step;
13419   if (const_vec_series_p (x, &base, &step)
13420       && base == const0_rtx
13421       && aarch64_sve_index_immediate_p (step))
13422     return step;
13423   return NULL_RTX;
13424 }
13425
13426 /* Check of immediate shift constants are within range.  */
13427 bool
13428 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13429 {
13430   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13431   if (left)
13432     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13433   else
13434     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13435 }
13436
13437 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13438    operation of width WIDTH at bit position POS.  */
13439
13440 rtx
13441 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13442 {
13443   gcc_assert (CONST_INT_P (width));
13444   gcc_assert (CONST_INT_P (pos));
13445
13446   unsigned HOST_WIDE_INT mask
13447     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13448   return GEN_INT (mask << UINTVAL (pos));
13449 }
13450
13451 bool
13452 aarch64_mov_operand_p (rtx x, machine_mode mode)
13453 {
13454   if (GET_CODE (x) == HIGH
13455       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13456     return true;
13457
13458   if (CONST_INT_P (x))
13459     return true;
13460
13461   if (VECTOR_MODE_P (GET_MODE (x)))
13462     return aarch64_simd_valid_immediate (x, NULL);
13463
13464   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13465     return true;
13466
13467   if (aarch64_sve_cnt_immediate_p (x))
13468     return true;
13469
13470   return aarch64_classify_symbolic_expression (x)
13471     == SYMBOL_TINY_ABSOLUTE;
13472 }
13473
13474 /* Return a const_int vector of VAL.  */
13475 rtx
13476 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13477 {
13478   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13479   return gen_const_vec_duplicate (mode, c);
13480 }
13481
13482 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
13483
13484 bool
13485 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13486 {
13487   machine_mode vmode;
13488
13489   vmode = aarch64_simd_container_mode (mode, 64);
13490   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13491   return aarch64_simd_valid_immediate (op_v, NULL);
13492 }
13493
13494 /* Construct and return a PARALLEL RTX vector with elements numbering the
13495    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13496    the vector - from the perspective of the architecture.  This does not
13497    line up with GCC's perspective on lane numbers, so we end up with
13498    different masks depending on our target endian-ness.  The diagram
13499    below may help.  We must draw the distinction when building masks
13500    which select one half of the vector.  An instruction selecting
13501    architectural low-lanes for a big-endian target, must be described using
13502    a mask selecting GCC high-lanes.
13503
13504                  Big-Endian             Little-Endian
13505
13506 GCC             0   1   2   3           3   2   1   0
13507               | x | x | x | x |       | x | x | x | x |
13508 Architecture    3   2   1   0           3   2   1   0
13509
13510 Low Mask:         { 2, 3 }                { 0, 1 }
13511 High Mask:        { 0, 1 }                { 2, 3 }
13512
13513    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
13514
13515 rtx
13516 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13517 {
13518   rtvec v = rtvec_alloc (nunits / 2);
13519   int high_base = nunits / 2;
13520   int low_base = 0;
13521   int base;
13522   rtx t1;
13523   int i;
13524
13525   if (BYTES_BIG_ENDIAN)
13526     base = high ? low_base : high_base;
13527   else
13528     base = high ? high_base : low_base;
13529
13530   for (i = 0; i < nunits / 2; i++)
13531     RTVEC_ELT (v, i) = GEN_INT (base + i);
13532
13533   t1 = gen_rtx_PARALLEL (mode, v);
13534   return t1;
13535 }
13536
13537 /* Check OP for validity as a PARALLEL RTX vector with elements
13538    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13539    from the perspective of the architecture.  See the diagram above
13540    aarch64_simd_vect_par_cnst_half for more details.  */
13541
13542 bool
13543 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13544                                        bool high)
13545 {
13546   int nelts;
13547   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13548     return false;
13549
13550   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13551   HOST_WIDE_INT count_op = XVECLEN (op, 0);
13552   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13553   int i = 0;
13554
13555   if (count_op != count_ideal)
13556     return false;
13557
13558   for (i = 0; i < count_ideal; i++)
13559     {
13560       rtx elt_op = XVECEXP (op, 0, i);
13561       rtx elt_ideal = XVECEXP (ideal, 0, i);
13562
13563       if (!CONST_INT_P (elt_op)
13564           || INTVAL (elt_ideal) != INTVAL (elt_op))
13565         return false;
13566     }
13567   return true;
13568 }
13569
13570 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
13571    HIGH (exclusive).  */
13572 void
13573 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13574                           const_tree exp)
13575 {
13576   HOST_WIDE_INT lane;
13577   gcc_assert (CONST_INT_P (operand));
13578   lane = INTVAL (operand);
13579
13580   if (lane < low || lane >= high)
13581   {
13582     if (exp)
13583       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13584     else
13585       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13586   }
13587 }
13588
13589 /* Peform endian correction on lane number N, which indexes a vector
13590    of mode MODE, and return the result as an SImode rtx.  */
13591
13592 rtx
13593 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13594 {
13595   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13596 }
13597
13598 /* Return TRUE if OP is a valid vector addressing mode.  */
13599
13600 bool
13601 aarch64_simd_mem_operand_p (rtx op)
13602 {
13603   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13604                         || REG_P (XEXP (op, 0)));
13605 }
13606
13607 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
13608
13609 bool
13610 aarch64_sve_ld1r_operand_p (rtx op)
13611 {
13612   struct aarch64_address_info addr;
13613   scalar_mode mode;
13614
13615   return (MEM_P (op)
13616           && is_a <scalar_mode> (GET_MODE (op), &mode)
13617           && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13618           && addr.type == ADDRESS_REG_IMM
13619           && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13620 }
13621
13622 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13623    The conditions for STR are the same.  */
13624 bool
13625 aarch64_sve_ldr_operand_p (rtx op)
13626 {
13627   struct aarch64_address_info addr;
13628
13629   return (MEM_P (op)
13630           && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13631                                        false, ADDR_QUERY_ANY)
13632           && addr.type == ADDRESS_REG_IMM);
13633 }
13634
13635 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13636    We need to be able to access the individual pieces, so the range
13637    is different from LD[234] and ST[234].  */
13638 bool
13639 aarch64_sve_struct_memory_operand_p (rtx op)
13640 {
13641   if (!MEM_P (op))
13642     return false;
13643
13644   machine_mode mode = GET_MODE (op);
13645   struct aarch64_address_info addr;
13646   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13647                                  ADDR_QUERY_ANY)
13648       || addr.type != ADDRESS_REG_IMM)
13649     return false;
13650
13651   poly_int64 first = addr.const_offset;
13652   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13653   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13654           && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13655 }
13656
13657 /* Emit a register copy from operand to operand, taking care not to
13658    early-clobber source registers in the process.
13659
13660    COUNT is the number of components into which the copy needs to be
13661    decomposed.  */
13662 void
13663 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13664                                 unsigned int count)
13665 {
13666   unsigned int i;
13667   int rdest = REGNO (operands[0]);
13668   int rsrc = REGNO (operands[1]);
13669
13670   if (!reg_overlap_mentioned_p (operands[0], operands[1])
13671       || rdest < rsrc)
13672     for (i = 0; i < count; i++)
13673       emit_move_insn (gen_rtx_REG (mode, rdest + i),
13674                       gen_rtx_REG (mode, rsrc + i));
13675   else
13676     for (i = 0; i < count; i++)
13677       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13678                       gen_rtx_REG (mode, rsrc + count - i - 1));
13679 }
13680
13681 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13682    one of VSTRUCT modes: OI, CI, or XI.  */
13683 int
13684 aarch64_simd_attr_length_rglist (machine_mode mode)
13685 {
13686   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
13687   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13688 }
13689
13690 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
13691    alignment of a vector to 128 bits.  SVE predicates have an alignment of
13692    16 bits.  */
13693 static HOST_WIDE_INT
13694 aarch64_simd_vector_alignment (const_tree type)
13695 {
13696   if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13697     /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13698        be set for non-predicate vectors of booleans.  Modes are the most
13699        direct way we have of identifying real SVE predicate types.  */
13700     return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13701   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13702   return MIN (align, 128);
13703 }
13704
13705 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
13706 static HOST_WIDE_INT
13707 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13708 {
13709   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13710     {
13711       /* If the length of the vector is fixed, try to align to that length,
13712          otherwise don't try to align at all.  */
13713       HOST_WIDE_INT result;
13714       if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13715         result = TYPE_ALIGN (TREE_TYPE (type));
13716       return result;
13717     }
13718   return TYPE_ALIGN (type);
13719 }
13720
13721 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
13722 static bool
13723 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13724 {
13725   if (is_packed)
13726     return false;
13727
13728   /* For fixed-length vectors, check that the vectorizer will aim for
13729      full-vector alignment.  This isn't true for generic GCC vectors
13730      that are wider than the ABI maximum of 128 bits.  */
13731   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13732       && (wi::to_widest (TYPE_SIZE (type))
13733           != aarch64_vectorize_preferred_vector_alignment (type)))
13734     return false;
13735
13736   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
13737   return true;
13738 }
13739
13740 /* Return true if the vector misalignment factor is supported by the
13741    target.  */
13742 static bool
13743 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13744                                              const_tree type, int misalignment,
13745                                              bool is_packed)
13746 {
13747   if (TARGET_SIMD && STRICT_ALIGNMENT)
13748     {
13749       /* Return if movmisalign pattern is not supported for this mode.  */
13750       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13751         return false;
13752
13753       /* Misalignment factor is unknown at compile time.  */
13754       if (misalignment == -1)
13755         return false;
13756     }
13757   return default_builtin_support_vector_misalignment (mode, type, misalignment,
13758                                                       is_packed);
13759 }
13760
13761 /* If VALS is a vector constant that can be loaded into a register
13762    using DUP, generate instructions to do so and return an RTX to
13763    assign to the register.  Otherwise return NULL_RTX.  */
13764 static rtx
13765 aarch64_simd_dup_constant (rtx vals)
13766 {
13767   machine_mode mode = GET_MODE (vals);
13768   machine_mode inner_mode = GET_MODE_INNER (mode);
13769   rtx x;
13770
13771   if (!const_vec_duplicate_p (vals, &x))
13772     return NULL_RTX;
13773
13774   /* We can load this constant by using DUP and a constant in a
13775      single ARM register.  This will be cheaper than a vector
13776      load.  */
13777   x = copy_to_mode_reg (inner_mode, x);
13778   return gen_vec_duplicate (mode, x);
13779 }
13780
13781
13782 /* Generate code to load VALS, which is a PARALLEL containing only
13783    constants (for vec_init) or CONST_VECTOR, efficiently into a
13784    register.  Returns an RTX to copy into the register, or NULL_RTX
13785    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
13786 static rtx
13787 aarch64_simd_make_constant (rtx vals)
13788 {
13789   machine_mode mode = GET_MODE (vals);
13790   rtx const_dup;
13791   rtx const_vec = NULL_RTX;
13792   int n_const = 0;
13793   int i;
13794
13795   if (GET_CODE (vals) == CONST_VECTOR)
13796     const_vec = vals;
13797   else if (GET_CODE (vals) == PARALLEL)
13798     {
13799       /* A CONST_VECTOR must contain only CONST_INTs and
13800          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13801          Only store valid constants in a CONST_VECTOR.  */
13802       int n_elts = XVECLEN (vals, 0);
13803       for (i = 0; i < n_elts; ++i)
13804         {
13805           rtx x = XVECEXP (vals, 0, i);
13806           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13807             n_const++;
13808         }
13809       if (n_const == n_elts)
13810         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13811     }
13812   else
13813     gcc_unreachable ();
13814
13815   if (const_vec != NULL_RTX
13816       && aarch64_simd_valid_immediate (const_vec, NULL))
13817     /* Load using MOVI/MVNI.  */
13818     return const_vec;
13819   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13820     /* Loaded using DUP.  */
13821     return const_dup;
13822   else if (const_vec != NULL_RTX)
13823     /* Load from constant pool. We can not take advantage of single-cycle
13824        LD1 because we need a PC-relative addressing mode.  */
13825     return const_vec;
13826   else
13827     /* A PARALLEL containing something not valid inside CONST_VECTOR.
13828        We can not construct an initializer.  */
13829     return NULL_RTX;
13830 }
13831
13832 /* Expand a vector initialisation sequence, such that TARGET is
13833    initialised to contain VALS.  */
13834
13835 void
13836 aarch64_expand_vector_init (rtx target, rtx vals)
13837 {
13838   machine_mode mode = GET_MODE (target);
13839   scalar_mode inner_mode = GET_MODE_INNER (mode);
13840   /* The number of vector elements.  */
13841   int n_elts = XVECLEN (vals, 0);
13842   /* The number of vector elements which are not constant.  */
13843   int n_var = 0;
13844   rtx any_const = NULL_RTX;
13845   /* The first element of vals.  */
13846   rtx v0 = XVECEXP (vals, 0, 0);
13847   bool all_same = true;
13848
13849   /* Count the number of variable elements to initialise.  */
13850   for (int i = 0; i < n_elts; ++i)
13851     {
13852       rtx x = XVECEXP (vals, 0, i);
13853       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13854         ++n_var;
13855       else
13856         any_const = x;
13857
13858       all_same &= rtx_equal_p (x, v0);
13859     }
13860
13861   /* No variable elements, hand off to aarch64_simd_make_constant which knows
13862      how best to handle this.  */
13863   if (n_var == 0)
13864     {
13865       rtx constant = aarch64_simd_make_constant (vals);
13866       if (constant != NULL_RTX)
13867         {
13868           emit_move_insn (target, constant);
13869           return;
13870         }
13871     }
13872
13873   /* Splat a single non-constant element if we can.  */
13874   if (all_same)
13875     {
13876       rtx x = copy_to_mode_reg (inner_mode, v0);
13877       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13878       return;
13879     }
13880
13881   enum insn_code icode = optab_handler (vec_set_optab, mode);
13882   gcc_assert (icode != CODE_FOR_nothing);
13883
13884   /* If there are only variable elements, try to optimize
13885      the insertion using dup for the most common element
13886      followed by insertions.  */
13887
13888   /* The algorithm will fill matches[*][0] with the earliest matching element,
13889      and matches[X][1] with the count of duplicate elements (if X is the
13890      earliest element which has duplicates).  */
13891
13892   if (n_var == n_elts && n_elts <= 16)
13893     {
13894       int matches[16][2] = {0};
13895       for (int i = 0; i < n_elts; i++)
13896         {
13897           for (int j = 0; j <= i; j++)
13898             {
13899               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13900                 {
13901                   matches[i][0] = j;
13902                   matches[j][1]++;
13903                   break;
13904                 }
13905             }
13906         }
13907       int maxelement = 0;
13908       int maxv = 0;
13909       for (int i = 0; i < n_elts; i++)
13910         if (matches[i][1] > maxv)
13911           {
13912             maxelement = i;
13913             maxv = matches[i][1];
13914           }
13915
13916       /* Create a duplicate of the most common element, unless all elements
13917          are equally useless to us, in which case just immediately set the
13918          vector register using the first element.  */
13919
13920       if (maxv == 1)
13921         {
13922           /* For vectors of two 64-bit elements, we can do even better.  */
13923           if (n_elts == 2
13924               && (inner_mode == E_DImode
13925                   || inner_mode == E_DFmode))
13926
13927             {
13928               rtx x0 = XVECEXP (vals, 0, 0);
13929               rtx x1 = XVECEXP (vals, 0, 1);
13930               /* Combine can pick up this case, but handling it directly
13931                  here leaves clearer RTL.
13932
13933                  This is load_pair_lanes<mode>, and also gives us a clean-up
13934                  for store_pair_lanes<mode>.  */
13935               if (memory_operand (x0, inner_mode)
13936                   && memory_operand (x1, inner_mode)
13937                   && !STRICT_ALIGNMENT
13938                   && rtx_equal_p (XEXP (x1, 0),
13939                                   plus_constant (Pmode,
13940                                                  XEXP (x0, 0),
13941                                                  GET_MODE_SIZE (inner_mode))))
13942                 {
13943                   rtx t;
13944                   if (inner_mode == DFmode)
13945                     t = gen_load_pair_lanesdf (target, x0, x1);
13946                   else
13947                     t = gen_load_pair_lanesdi (target, x0, x1);
13948                   emit_insn (t);
13949                   return;
13950                 }
13951             }
13952           /* The subreg-move sequence below will move into lane zero of the
13953              vector register.  For big-endian we want that position to hold
13954              the last element of VALS.  */
13955           maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
13956           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13957           aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
13958         }
13959       else
13960         {
13961           rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13962           aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13963         }
13964
13965       /* Insert the rest.  */
13966       for (int i = 0; i < n_elts; i++)
13967         {
13968           rtx x = XVECEXP (vals, 0, i);
13969           if (matches[i][0] == maxelement)
13970             continue;
13971           x = copy_to_mode_reg (inner_mode, x);
13972           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13973         }
13974       return;
13975     }
13976
13977   /* Initialise a vector which is part-variable.  We want to first try
13978      to build those lanes which are constant in the most efficient way we
13979      can.  */
13980   if (n_var != n_elts)
13981     {
13982       rtx copy = copy_rtx (vals);
13983
13984       /* Load constant part of vector.  We really don't care what goes into the
13985          parts we will overwrite, but we're more likely to be able to load the
13986          constant efficiently if it has fewer, larger, repeating parts
13987          (see aarch64_simd_valid_immediate).  */
13988       for (int i = 0; i < n_elts; i++)
13989         {
13990           rtx x = XVECEXP (vals, 0, i);
13991           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13992             continue;
13993           rtx subst = any_const;
13994           for (int bit = n_elts / 2; bit > 0; bit /= 2)
13995             {
13996               /* Look in the copied vector, as more elements are const.  */
13997               rtx test = XVECEXP (copy, 0, i ^ bit);
13998               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13999                 {
14000                   subst = test;
14001                   break;
14002                 }
14003             }
14004           XVECEXP (copy, 0, i) = subst;
14005         }
14006       aarch64_expand_vector_init (target, copy);
14007     }
14008
14009   /* Insert the variable lanes directly.  */
14010   for (int i = 0; i < n_elts; i++)
14011     {
14012       rtx x = XVECEXP (vals, 0, i);
14013       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14014         continue;
14015       x = copy_to_mode_reg (inner_mode, x);
14016       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14017     }
14018 }
14019
14020 static unsigned HOST_WIDE_INT
14021 aarch64_shift_truncation_mask (machine_mode mode)
14022 {
14023   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14024     return 0;
14025   return GET_MODE_UNIT_BITSIZE (mode) - 1;
14026 }
14027
14028 /* Select a format to encode pointers in exception handling data.  */
14029 int
14030 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14031 {
14032    int type;
14033    switch (aarch64_cmodel)
14034      {
14035      case AARCH64_CMODEL_TINY:
14036      case AARCH64_CMODEL_TINY_PIC:
14037      case AARCH64_CMODEL_SMALL:
14038      case AARCH64_CMODEL_SMALL_PIC:
14039      case AARCH64_CMODEL_SMALL_SPIC:
14040        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
14041           for everything.  */
14042        type = DW_EH_PE_sdata4;
14043        break;
14044      default:
14045        /* No assumptions here.  8-byte relocs required.  */
14046        type = DW_EH_PE_sdata8;
14047        break;
14048      }
14049    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14050 }
14051
14052 /* The last .arch and .tune assembly strings that we printed.  */
14053 static std::string aarch64_last_printed_arch_string;
14054 static std::string aarch64_last_printed_tune_string;
14055
14056 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
14057    by the function fndecl.  */
14058
14059 void
14060 aarch64_declare_function_name (FILE *stream, const char* name,
14061                                 tree fndecl)
14062 {
14063   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14064
14065   struct cl_target_option *targ_options;
14066   if (target_parts)
14067     targ_options = TREE_TARGET_OPTION (target_parts);
14068   else
14069     targ_options = TREE_TARGET_OPTION (target_option_current_node);
14070   gcc_assert (targ_options);
14071
14072   const struct processor *this_arch
14073     = aarch64_get_arch (targ_options->x_explicit_arch);
14074
14075   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14076   std::string extension
14077     = aarch64_get_extension_string_for_isa_flags (isa_flags,
14078                                                   this_arch->flags);
14079   /* Only update the assembler .arch string if it is distinct from the last
14080      such string we printed.  */
14081   std::string to_print = this_arch->name + extension;
14082   if (to_print != aarch64_last_printed_arch_string)
14083     {
14084       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14085       aarch64_last_printed_arch_string = to_print;
14086     }
14087
14088   /* Print the cpu name we're tuning for in the comments, might be
14089      useful to readers of the generated asm.  Do it only when it changes
14090      from function to function and verbose assembly is requested.  */
14091   const struct processor *this_tune
14092     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14093
14094   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14095     {
14096       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14097                    this_tune->name);
14098       aarch64_last_printed_tune_string = this_tune->name;
14099     }
14100
14101   /* Don't forget the type directive for ELF.  */
14102   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14103   ASM_OUTPUT_LABEL (stream, name);
14104 }
14105
14106 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
14107
14108 static void
14109 aarch64_start_file (void)
14110 {
14111   struct cl_target_option *default_options
14112     = TREE_TARGET_OPTION (target_option_default_node);
14113
14114   const struct processor *default_arch
14115     = aarch64_get_arch (default_options->x_explicit_arch);
14116   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14117   std::string extension
14118     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14119                                                   default_arch->flags);
14120
14121    aarch64_last_printed_arch_string = default_arch->name + extension;
14122    aarch64_last_printed_tune_string = "";
14123    asm_fprintf (asm_out_file, "\t.arch %s\n",
14124                 aarch64_last_printed_arch_string.c_str ());
14125
14126    default_file_start ();
14127 }
14128
14129 /* Emit load exclusive.  */
14130
14131 static void
14132 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14133                              rtx mem, rtx model_rtx)
14134 {
14135   rtx (*gen) (rtx, rtx, rtx);
14136
14137   switch (mode)
14138     {
14139     case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14140     case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14141     case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14142     case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14143     default:
14144       gcc_unreachable ();
14145     }
14146
14147   emit_insn (gen (rval, mem, model_rtx));
14148 }
14149
14150 /* Emit store exclusive.  */
14151
14152 static void
14153 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14154                               rtx rval, rtx mem, rtx model_rtx)
14155 {
14156   rtx (*gen) (rtx, rtx, rtx, rtx);
14157
14158   switch (mode)
14159     {
14160     case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14161     case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14162     case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14163     case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14164     default:
14165       gcc_unreachable ();
14166     }
14167
14168   emit_insn (gen (bval, rval, mem, model_rtx));
14169 }
14170
14171 /* Mark the previous jump instruction as unlikely.  */
14172
14173 static void
14174 aarch64_emit_unlikely_jump (rtx insn)
14175 {
14176   rtx_insn *jump = emit_jump_insn (insn);
14177   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14178 }
14179
14180 /* Expand a compare and swap pattern.  */
14181
14182 void
14183 aarch64_expand_compare_and_swap (rtx operands[])
14184 {
14185   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14186   machine_mode mode, cmp_mode;
14187   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14188   int idx;
14189   gen_cas_fn gen;
14190   const gen_cas_fn split_cas[] =
14191   {
14192     gen_aarch64_compare_and_swapqi,
14193     gen_aarch64_compare_and_swaphi,
14194     gen_aarch64_compare_and_swapsi,
14195     gen_aarch64_compare_and_swapdi
14196   };
14197   const gen_cas_fn atomic_cas[] =
14198   {
14199     gen_aarch64_compare_and_swapqi_lse,
14200     gen_aarch64_compare_and_swaphi_lse,
14201     gen_aarch64_compare_and_swapsi_lse,
14202     gen_aarch64_compare_and_swapdi_lse
14203   };
14204
14205   bval = operands[0];
14206   rval = operands[1];
14207   mem = operands[2];
14208   oldval = operands[3];
14209   newval = operands[4];
14210   is_weak = operands[5];
14211   mod_s = operands[6];
14212   mod_f = operands[7];
14213   mode = GET_MODE (mem);
14214   cmp_mode = mode;
14215
14216   /* Normally the succ memory model must be stronger than fail, but in the
14217      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14218      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
14219
14220   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14221       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14222     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14223
14224   switch (mode)
14225     {
14226     case E_QImode:
14227     case E_HImode:
14228       /* For short modes, we're going to perform the comparison in SImode,
14229          so do the zero-extension now.  */
14230       cmp_mode = SImode;
14231       rval = gen_reg_rtx (SImode);
14232       oldval = convert_modes (SImode, mode, oldval, true);
14233       /* Fall through.  */
14234
14235     case E_SImode:
14236     case E_DImode:
14237       /* Force the value into a register if needed.  */
14238       if (!aarch64_plus_operand (oldval, mode))
14239         oldval = force_reg (cmp_mode, oldval);
14240       break;
14241
14242     default:
14243       gcc_unreachable ();
14244     }
14245
14246   switch (mode)
14247     {
14248     case E_QImode: idx = 0; break;
14249     case E_HImode: idx = 1; break;
14250     case E_SImode: idx = 2; break;
14251     case E_DImode: idx = 3; break;
14252     default:
14253       gcc_unreachable ();
14254     }
14255   if (TARGET_LSE)
14256     gen = atomic_cas[idx];
14257   else
14258     gen = split_cas[idx];
14259
14260   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14261
14262   if (mode == QImode || mode == HImode)
14263     emit_move_insn (operands[1], gen_lowpart (mode, rval));
14264
14265   x = gen_rtx_REG (CCmode, CC_REGNUM);
14266   x = gen_rtx_EQ (SImode, x, const0_rtx);
14267   emit_insn (gen_rtx_SET (bval, x));
14268 }
14269
14270 /* Test whether the target supports using a atomic load-operate instruction.
14271    CODE is the operation and AFTER is TRUE if the data in memory after the
14272    operation should be returned and FALSE if the data before the operation
14273    should be returned.  Returns FALSE if the operation isn't supported by the
14274    architecture.  */
14275
14276 bool
14277 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14278 {
14279   if (!TARGET_LSE)
14280     return false;
14281
14282   switch (code)
14283     {
14284     case SET:
14285     case AND:
14286     case IOR:
14287     case XOR:
14288     case MINUS:
14289     case PLUS:
14290       return true;
14291     default:
14292       return false;
14293     }
14294 }
14295
14296 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14297    sequence implementing an atomic operation.  */
14298
14299 static void
14300 aarch64_emit_post_barrier (enum memmodel model)
14301 {
14302   const enum memmodel base_model = memmodel_base (model);
14303
14304   if (is_mm_sync (model)
14305       && (base_model == MEMMODEL_ACQUIRE
14306           || base_model == MEMMODEL_ACQ_REL
14307           || base_model == MEMMODEL_SEQ_CST))
14308     {
14309       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14310     }
14311 }
14312
14313 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
14314    for the data in memory.  EXPECTED is the value expected to be in memory.
14315    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
14316    is the memory ordering to use.  */
14317
14318 void
14319 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14320                         rtx expected, rtx desired,
14321                         rtx model)
14322 {
14323   rtx (*gen) (rtx, rtx, rtx, rtx);
14324   machine_mode mode;
14325
14326   mode = GET_MODE (mem);
14327
14328   switch (mode)
14329     {
14330     case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14331     case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14332     case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14333     case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14334     default:
14335       gcc_unreachable ();
14336     }
14337
14338   /* Move the expected value into the CAS destination register.  */
14339   emit_insn (gen_rtx_SET (rval, expected));
14340
14341   /* Emit the CAS.  */
14342   emit_insn (gen (rval, mem, desired, model));
14343
14344   /* Compare the expected value with the value loaded by the CAS, to establish
14345      whether the swap was made.  */
14346   aarch64_gen_compare_reg (EQ, rval, expected);
14347 }
14348
14349 /* Split a compare and swap pattern.  */
14350
14351 void
14352 aarch64_split_compare_and_swap (rtx operands[])
14353 {
14354   rtx rval, mem, oldval, newval, scratch;
14355   machine_mode mode;
14356   bool is_weak;
14357   rtx_code_label *label1, *label2;
14358   rtx x, cond;
14359   enum memmodel model;
14360   rtx model_rtx;
14361
14362   rval = operands[0];
14363   mem = operands[1];
14364   oldval = operands[2];
14365   newval = operands[3];
14366   is_weak = (operands[4] != const0_rtx);
14367   model_rtx = operands[5];
14368   scratch = operands[7];
14369   mode = GET_MODE (mem);
14370   model = memmodel_from_int (INTVAL (model_rtx));
14371
14372   /* When OLDVAL is zero and we want the strong version we can emit a tighter
14373     loop:
14374     .label1:
14375         LD[A]XR rval, [mem]
14376         CBNZ    rval, .label2
14377         ST[L]XR scratch, newval, [mem]
14378         CBNZ    scratch, .label1
14379     .label2:
14380         CMP     rval, 0.  */
14381   bool strong_zero_p = !is_weak && oldval == const0_rtx;
14382
14383   label1 = NULL;
14384   if (!is_weak)
14385     {
14386       label1 = gen_label_rtx ();
14387       emit_label (label1);
14388     }
14389   label2 = gen_label_rtx ();
14390
14391   /* The initial load can be relaxed for a __sync operation since a final
14392      barrier will be emitted to stop code hoisting.  */
14393   if (is_mm_sync (model))
14394     aarch64_emit_load_exclusive (mode, rval, mem,
14395                                  GEN_INT (MEMMODEL_RELAXED));
14396   else
14397     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14398
14399   if (strong_zero_p)
14400     {
14401       x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14402       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14403                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14404       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14405     }
14406   else
14407     {
14408       cond = aarch64_gen_compare_reg (NE, rval, oldval);
14409       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14410       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14411                                  gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14412       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14413     }
14414
14415   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14416
14417   if (!is_weak)
14418     {
14419       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14420       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14421                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14422       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14423     }
14424   else
14425     {
14426       cond = gen_rtx_REG (CCmode, CC_REGNUM);
14427       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14428       emit_insn (gen_rtx_SET (cond, x));
14429     }
14430
14431   emit_label (label2);
14432   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14433      to set the condition flags.  If this is not used it will be removed by
14434      later passes.  */
14435   if (strong_zero_p)
14436     {
14437       cond = gen_rtx_REG (CCmode, CC_REGNUM);
14438       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14439       emit_insn (gen_rtx_SET (cond, x));
14440     }
14441   /* Emit any final barrier needed for a __sync operation.  */
14442   if (is_mm_sync (model))
14443     aarch64_emit_post_barrier (model);
14444 }
14445
14446 /* Emit a BIC instruction.  */
14447
14448 static void
14449 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14450 {
14451   rtx shift_rtx = GEN_INT (shift);
14452   rtx (*gen) (rtx, rtx, rtx, rtx);
14453
14454   switch (mode)
14455     {
14456     case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14457     case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14458     default:
14459       gcc_unreachable ();
14460     }
14461
14462   emit_insn (gen (dst, s2, shift_rtx, s1));
14463 }
14464
14465 /* Emit an atomic swap.  */
14466
14467 static void
14468 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14469                           rtx mem, rtx model)
14470 {
14471   rtx (*gen) (rtx, rtx, rtx, rtx);
14472
14473   switch (mode)
14474     {
14475     case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14476     case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14477     case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14478     case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14479     default:
14480       gcc_unreachable ();
14481     }
14482
14483   emit_insn (gen (dst, mem, value, model));
14484 }
14485
14486 /* Operations supported by aarch64_emit_atomic_load_op.  */
14487
14488 enum aarch64_atomic_load_op_code
14489 {
14490   AARCH64_LDOP_PLUS,    /* A + B  */
14491   AARCH64_LDOP_XOR,     /* A ^ B  */
14492   AARCH64_LDOP_OR,      /* A | B  */
14493   AARCH64_LDOP_BIC      /* A & ~B  */
14494 };
14495
14496 /* Emit an atomic load-operate.  */
14497
14498 static void
14499 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14500                              machine_mode mode, rtx dst, rtx src,
14501                              rtx mem, rtx model)
14502 {
14503   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14504   const aarch64_atomic_load_op_fn plus[] =
14505   {
14506     gen_aarch64_atomic_loadaddqi,
14507     gen_aarch64_atomic_loadaddhi,
14508     gen_aarch64_atomic_loadaddsi,
14509     gen_aarch64_atomic_loadadddi
14510   };
14511   const aarch64_atomic_load_op_fn eor[] =
14512   {
14513     gen_aarch64_atomic_loadeorqi,
14514     gen_aarch64_atomic_loadeorhi,
14515     gen_aarch64_atomic_loadeorsi,
14516     gen_aarch64_atomic_loadeordi
14517   };
14518   const aarch64_atomic_load_op_fn ior[] =
14519   {
14520     gen_aarch64_atomic_loadsetqi,
14521     gen_aarch64_atomic_loadsethi,
14522     gen_aarch64_atomic_loadsetsi,
14523     gen_aarch64_atomic_loadsetdi
14524   };
14525   const aarch64_atomic_load_op_fn bic[] =
14526   {
14527     gen_aarch64_atomic_loadclrqi,
14528     gen_aarch64_atomic_loadclrhi,
14529     gen_aarch64_atomic_loadclrsi,
14530     gen_aarch64_atomic_loadclrdi
14531   };
14532   aarch64_atomic_load_op_fn gen;
14533   int idx = 0;
14534
14535   switch (mode)
14536     {
14537     case E_QImode: idx = 0; break;
14538     case E_HImode: idx = 1; break;
14539     case E_SImode: idx = 2; break;
14540     case E_DImode: idx = 3; break;
14541     default:
14542       gcc_unreachable ();
14543     }
14544
14545   switch (code)
14546     {
14547     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14548     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14549     case AARCH64_LDOP_OR: gen = ior[idx]; break;
14550     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14551     default:
14552       gcc_unreachable ();
14553     }
14554
14555   emit_insn (gen (dst, mem, src, model));
14556 }
14557
14558 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
14559    location to store the data read from memory.  OUT_RESULT is the location to
14560    store the result of the operation.  MEM is the memory location to read and
14561    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
14562    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
14563    be NULL.  */
14564
14565 void
14566 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14567                          rtx mem, rtx value, rtx model_rtx)
14568 {
14569   machine_mode mode = GET_MODE (mem);
14570   machine_mode wmode = (mode == DImode ? DImode : SImode);
14571   const bool short_mode = (mode < SImode);
14572   aarch64_atomic_load_op_code ldop_code;
14573   rtx src;
14574   rtx x;
14575
14576   if (out_data)
14577     out_data = gen_lowpart (mode, out_data);
14578
14579   if (out_result)
14580     out_result = gen_lowpart (mode, out_result);
14581
14582   /* Make sure the value is in a register, putting it into a destination
14583      register if it needs to be manipulated.  */
14584   if (!register_operand (value, mode)
14585       || code == AND || code == MINUS)
14586     {
14587       src = out_result ? out_result : out_data;
14588       emit_move_insn (src, gen_lowpart (mode, value));
14589     }
14590   else
14591     src = value;
14592   gcc_assert (register_operand (src, mode));
14593
14594   /* Preprocess the data for the operation as necessary.  If the operation is
14595      a SET then emit a swap instruction and finish.  */
14596   switch (code)
14597     {
14598     case SET:
14599       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14600       return;
14601
14602     case MINUS:
14603       /* Negate the value and treat it as a PLUS.  */
14604       {
14605         rtx neg_src;
14606
14607         /* Resize the value if necessary.  */
14608         if (short_mode)
14609           src = gen_lowpart (wmode, src);
14610
14611         neg_src = gen_rtx_NEG (wmode, src);
14612         emit_insn (gen_rtx_SET (src, neg_src));
14613
14614         if (short_mode)
14615           src = gen_lowpart (mode, src);
14616       }
14617       /* Fall-through.  */
14618     case PLUS:
14619       ldop_code = AARCH64_LDOP_PLUS;
14620       break;
14621
14622     case IOR:
14623       ldop_code = AARCH64_LDOP_OR;
14624       break;
14625
14626     case XOR:
14627       ldop_code = AARCH64_LDOP_XOR;
14628       break;
14629
14630     case AND:
14631       {
14632         rtx not_src;
14633
14634         /* Resize the value if necessary.  */
14635         if (short_mode)
14636           src = gen_lowpart (wmode, src);
14637
14638         not_src = gen_rtx_NOT (wmode, src);
14639         emit_insn (gen_rtx_SET (src, not_src));
14640
14641         if (short_mode)
14642           src = gen_lowpart (mode, src);
14643       }
14644       ldop_code = AARCH64_LDOP_BIC;
14645       break;
14646
14647     default:
14648       /* The operation can't be done with atomic instructions.  */
14649       gcc_unreachable ();
14650     }
14651
14652   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14653
14654   /* If necessary, calculate the data in memory after the update by redoing the
14655      operation from values in registers.  */
14656   if (!out_result)
14657     return;
14658
14659   if (short_mode)
14660     {
14661       src = gen_lowpart (wmode, src);
14662       out_data = gen_lowpart (wmode, out_data);
14663       out_result = gen_lowpart (wmode, out_result);
14664     }
14665
14666   x = NULL_RTX;
14667
14668   switch (code)
14669     {
14670     case MINUS:
14671     case PLUS:
14672       x = gen_rtx_PLUS (wmode, out_data, src);
14673       break;
14674     case IOR:
14675       x = gen_rtx_IOR (wmode, out_data, src);
14676       break;
14677     case XOR:
14678       x = gen_rtx_XOR (wmode, out_data, src);
14679       break;
14680     case AND:
14681       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14682       return;
14683     default:
14684       gcc_unreachable ();
14685     }
14686
14687   emit_set_insn (out_result, x);
14688
14689   return;
14690 }
14691
14692 /* Split an atomic operation.  */
14693
14694 void
14695 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14696                          rtx value, rtx model_rtx, rtx cond)
14697 {
14698   machine_mode mode = GET_MODE (mem);
14699   machine_mode wmode = (mode == DImode ? DImode : SImode);
14700   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14701   const bool is_sync = is_mm_sync (model);
14702   rtx_code_label *label;
14703   rtx x;
14704
14705   /* Split the atomic operation into a sequence.  */
14706   label = gen_label_rtx ();
14707   emit_label (label);
14708
14709   if (new_out)
14710     new_out = gen_lowpart (wmode, new_out);
14711   if (old_out)
14712     old_out = gen_lowpart (wmode, old_out);
14713   else
14714     old_out = new_out;
14715   value = simplify_gen_subreg (wmode, value, mode, 0);
14716
14717   /* The initial load can be relaxed for a __sync operation since a final
14718      barrier will be emitted to stop code hoisting.  */
14719  if (is_sync)
14720     aarch64_emit_load_exclusive (mode, old_out, mem,
14721                                  GEN_INT (MEMMODEL_RELAXED));
14722   else
14723     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14724
14725   switch (code)
14726     {
14727     case SET:
14728       new_out = value;
14729       break;
14730
14731     case NOT:
14732       x = gen_rtx_AND (wmode, old_out, value);
14733       emit_insn (gen_rtx_SET (new_out, x));
14734       x = gen_rtx_NOT (wmode, new_out);
14735       emit_insn (gen_rtx_SET (new_out, x));
14736       break;
14737
14738     case MINUS:
14739       if (CONST_INT_P (value))
14740         {
14741           value = GEN_INT (-INTVAL (value));
14742           code = PLUS;
14743         }
14744       /* Fall through.  */
14745
14746     default:
14747       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14748       emit_insn (gen_rtx_SET (new_out, x));
14749       break;
14750     }
14751
14752   aarch64_emit_store_exclusive (mode, cond, mem,
14753                                 gen_lowpart (mode, new_out), model_rtx);
14754
14755   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14756   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14757                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14758   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14759
14760   /* Emit any final barrier needed for a __sync operation.  */
14761   if (is_sync)
14762     aarch64_emit_post_barrier (model);
14763 }
14764
14765 static void
14766 aarch64_init_libfuncs (void)
14767 {
14768    /* Half-precision float operations.  The compiler handles all operations
14769      with NULL libfuncs by converting to SFmode.  */
14770
14771   /* Conversions.  */
14772   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14773   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14774
14775   /* Arithmetic.  */
14776   set_optab_libfunc (add_optab, HFmode, NULL);
14777   set_optab_libfunc (sdiv_optab, HFmode, NULL);
14778   set_optab_libfunc (smul_optab, HFmode, NULL);
14779   set_optab_libfunc (neg_optab, HFmode, NULL);
14780   set_optab_libfunc (sub_optab, HFmode, NULL);
14781
14782   /* Comparisons.  */
14783   set_optab_libfunc (eq_optab, HFmode, NULL);
14784   set_optab_libfunc (ne_optab, HFmode, NULL);
14785   set_optab_libfunc (lt_optab, HFmode, NULL);
14786   set_optab_libfunc (le_optab, HFmode, NULL);
14787   set_optab_libfunc (ge_optab, HFmode, NULL);
14788   set_optab_libfunc (gt_optab, HFmode, NULL);
14789   set_optab_libfunc (unord_optab, HFmode, NULL);
14790 }
14791
14792 /* Target hook for c_mode_for_suffix.  */
14793 static machine_mode
14794 aarch64_c_mode_for_suffix (char suffix)
14795 {
14796   if (suffix == 'q')
14797     return TFmode;
14798
14799   return VOIDmode;
14800 }
14801
14802 /* We can only represent floating point constants which will fit in
14803    "quarter-precision" values.  These values are characterised by
14804    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
14805    by:
14806
14807    (-1)^s * (n/16) * 2^r
14808
14809    Where:
14810      's' is the sign bit.
14811      'n' is an integer in the range 16 <= n <= 31.
14812      'r' is an integer in the range -3 <= r <= 4.  */
14813
14814 /* Return true iff X can be represented by a quarter-precision
14815    floating point immediate operand X.  Note, we cannot represent 0.0.  */
14816 bool
14817 aarch64_float_const_representable_p (rtx x)
14818 {
14819   /* This represents our current view of how many bits
14820      make up the mantissa.  */
14821   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14822   int exponent;
14823   unsigned HOST_WIDE_INT mantissa, mask;
14824   REAL_VALUE_TYPE r, m;
14825   bool fail;
14826
14827   if (!CONST_DOUBLE_P (x))
14828     return false;
14829
14830   /* We don't support HFmode constants yet.  */
14831   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14832     return false;
14833
14834   r = *CONST_DOUBLE_REAL_VALUE (x);
14835
14836   /* We cannot represent infinities, NaNs or +/-zero.  We won't
14837      know if we have +zero until we analyse the mantissa, but we
14838      can reject the other invalid values.  */
14839   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14840       || REAL_VALUE_MINUS_ZERO (r))
14841     return false;
14842
14843   /* Extract exponent.  */
14844   r = real_value_abs (&r);
14845   exponent = REAL_EXP (&r);
14846
14847   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14848      highest (sign) bit, with a fixed binary point at bit point_pos.
14849      m1 holds the low part of the mantissa, m2 the high part.
14850      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14851      bits for the mantissa, this can fail (low bits will be lost).  */
14852   real_ldexp (&m, &r, point_pos - exponent);
14853   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14854
14855   /* If the low part of the mantissa has bits set we cannot represent
14856      the value.  */
14857   if (w.ulow () != 0)
14858     return false;
14859   /* We have rejected the lower HOST_WIDE_INT, so update our
14860      understanding of how many bits lie in the mantissa and
14861      look only at the high HOST_WIDE_INT.  */
14862   mantissa = w.elt (1);
14863   point_pos -= HOST_BITS_PER_WIDE_INT;
14864
14865   /* We can only represent values with a mantissa of the form 1.xxxx.  */
14866   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14867   if ((mantissa & mask) != 0)
14868     return false;
14869
14870   /* Having filtered unrepresentable values, we may now remove all
14871      but the highest 5 bits.  */
14872   mantissa >>= point_pos - 5;
14873
14874   /* We cannot represent the value 0.0, so reject it.  This is handled
14875      elsewhere.  */
14876   if (mantissa == 0)
14877     return false;
14878
14879   /* Then, as bit 4 is always set, we can mask it off, leaving
14880      the mantissa in the range [0, 15].  */
14881   mantissa &= ~(1 << 4);
14882   gcc_assert (mantissa <= 15);
14883
14884   /* GCC internally does not use IEEE754-like encoding (where normalized
14885      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
14886      Our mantissa values are shifted 4 places to the left relative to
14887      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14888      by 5 places to correct for GCC's representation.  */
14889   exponent = 5 - exponent;
14890
14891   return (exponent >= 0 && exponent <= 7);
14892 }
14893
14894 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14895    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
14896    output MOVI/MVNI, ORR or BIC immediate.  */
14897 char*
14898 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14899                                    enum simd_immediate_check which)
14900 {
14901   bool is_valid;
14902   static char templ[40];
14903   const char *mnemonic;
14904   const char *shift_op;
14905   unsigned int lane_count = 0;
14906   char element_char;
14907
14908   struct simd_immediate_info info;
14909
14910   /* This will return true to show const_vector is legal for use as either
14911      a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14912      It will also update INFO to show how the immediate should be generated.
14913      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
14914   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14915   gcc_assert (is_valid);
14916
14917   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14918   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14919
14920   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14921     {
14922       gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14923       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14924          move immediate path.  */
14925       if (aarch64_float_const_zero_rtx_p (info.value))
14926         info.value = GEN_INT (0);
14927       else
14928         {
14929           const unsigned int buf_size = 20;
14930           char float_buf[buf_size] = {'\0'};
14931           real_to_decimal_for_mode (float_buf,
14932                                     CONST_DOUBLE_REAL_VALUE (info.value),
14933                                     buf_size, buf_size, 1, info.elt_mode);
14934
14935           if (lane_count == 1)
14936             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14937           else
14938             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14939                       lane_count, element_char, float_buf);
14940           return templ;
14941         }
14942     }
14943
14944   gcc_assert (CONST_INT_P (info.value));
14945
14946   if (which == AARCH64_CHECK_MOV)
14947     {
14948       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14949       shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14950       if (lane_count == 1)
14951         snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14952                   mnemonic, UINTVAL (info.value));
14953       else if (info.shift)
14954         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14955                   HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14956                   element_char, UINTVAL (info.value), shift_op, info.shift);
14957       else
14958         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14959                   HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14960                   element_char, UINTVAL (info.value));
14961     }
14962   else
14963     {
14964       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
14965       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14966       if (info.shift)
14967         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14968                   HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14969                   element_char, UINTVAL (info.value), "lsl", info.shift);
14970       else
14971         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14972                   HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14973                   element_char, UINTVAL (info.value));
14974     }
14975   return templ;
14976 }
14977
14978 char*
14979 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14980 {
14981
14982   /* If a floating point number was passed and we desire to use it in an
14983      integer mode do the conversion to integer.  */
14984   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14985     {
14986       unsigned HOST_WIDE_INT ival;
14987       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14988           gcc_unreachable ();
14989       immediate = gen_int_mode (ival, mode);
14990     }
14991
14992   machine_mode vmode;
14993   /* use a 64 bit mode for everything except for DI/DF mode, where we use
14994      a 128 bit vector mode.  */
14995   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14996
14997   vmode = aarch64_simd_container_mode (mode, width);
14998   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14999   return aarch64_output_simd_mov_immediate (v_op, width);
15000 }
15001
15002 /* Return the output string to use for moving immediate CONST_VECTOR
15003    into an SVE register.  */
15004
15005 char *
15006 aarch64_output_sve_mov_immediate (rtx const_vector)
15007 {
15008   static char templ[40];
15009   struct simd_immediate_info info;
15010   char element_char;
15011
15012   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15013   gcc_assert (is_valid);
15014
15015   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15016
15017   if (info.step)
15018     {
15019       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15020                 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15021                 element_char, INTVAL (info.value), INTVAL (info.step));
15022       return templ;
15023     }
15024
15025   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15026     {
15027       if (aarch64_float_const_zero_rtx_p (info.value))
15028         info.value = GEN_INT (0);
15029       else
15030         {
15031           const int buf_size = 20;
15032           char float_buf[buf_size] = {};
15033           real_to_decimal_for_mode (float_buf,
15034                                     CONST_DOUBLE_REAL_VALUE (info.value),
15035                                     buf_size, buf_size, 1, info.elt_mode);
15036
15037           snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15038                     element_char, float_buf);
15039           return templ;
15040         }
15041     }
15042
15043   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15044             element_char, INTVAL (info.value));
15045   return templ;
15046 }
15047
15048 /* Return the asm format for a PTRUE instruction whose destination has
15049    mode MODE.  SUFFIX is the element size suffix.  */
15050
15051 char *
15052 aarch64_output_ptrue (machine_mode mode, char suffix)
15053 {
15054   unsigned int nunits;
15055   static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15056   if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15057     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15058   else
15059     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15060   return buf;
15061 }
15062
15063 /* Split operands into moves from op[1] + op[2] into op[0].  */
15064
15065 void
15066 aarch64_split_combinev16qi (rtx operands[3])
15067 {
15068   unsigned int dest = REGNO (operands[0]);
15069   unsigned int src1 = REGNO (operands[1]);
15070   unsigned int src2 = REGNO (operands[2]);
15071   machine_mode halfmode = GET_MODE (operands[1]);
15072   unsigned int halfregs = REG_NREGS (operands[1]);
15073   rtx destlo, desthi;
15074
15075   gcc_assert (halfmode == V16QImode);
15076
15077   if (src1 == dest && src2 == dest + halfregs)
15078     {
15079       /* No-op move.  Can't split to nothing; emit something.  */
15080       emit_note (NOTE_INSN_DELETED);
15081       return;
15082     }
15083
15084   /* Preserve register attributes for variable tracking.  */
15085   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15086   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15087                                GET_MODE_SIZE (halfmode));
15088
15089   /* Special case of reversed high/low parts.  */
15090   if (reg_overlap_mentioned_p (operands[2], destlo)
15091       && reg_overlap_mentioned_p (operands[1], desthi))
15092     {
15093       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15094       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15095       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15096     }
15097   else if (!reg_overlap_mentioned_p (operands[2], destlo))
15098     {
15099       /* Try to avoid unnecessary moves if part of the result
15100          is in the right place already.  */
15101       if (src1 != dest)
15102         emit_move_insn (destlo, operands[1]);
15103       if (src2 != dest + halfregs)
15104         emit_move_insn (desthi, operands[2]);
15105     }
15106   else
15107     {
15108       if (src2 != dest + halfregs)
15109         emit_move_insn (desthi, operands[2]);
15110       if (src1 != dest)
15111         emit_move_insn (destlo, operands[1]);
15112     }
15113 }
15114
15115 /* vec_perm support.  */
15116
15117 struct expand_vec_perm_d
15118 {
15119   rtx target, op0, op1;
15120   vec_perm_indices perm;
15121   machine_mode vmode;
15122   unsigned int vec_flags;
15123   bool one_vector_p;
15124   bool testing_p;
15125 };
15126
15127 /* Generate a variable permutation.  */
15128
15129 static void
15130 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15131 {
15132   machine_mode vmode = GET_MODE (target);
15133   bool one_vector_p = rtx_equal_p (op0, op1);
15134
15135   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15136   gcc_checking_assert (GET_MODE (op0) == vmode);
15137   gcc_checking_assert (GET_MODE (op1) == vmode);
15138   gcc_checking_assert (GET_MODE (sel) == vmode);
15139   gcc_checking_assert (TARGET_SIMD);
15140
15141   if (one_vector_p)
15142     {
15143       if (vmode == V8QImode)
15144         {
15145           /* Expand the argument to a V16QI mode by duplicating it.  */
15146           rtx pair = gen_reg_rtx (V16QImode);
15147           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15148           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15149         }
15150       else
15151         {
15152           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15153         }
15154     }
15155   else
15156     {
15157       rtx pair;
15158
15159       if (vmode == V8QImode)
15160         {
15161           pair = gen_reg_rtx (V16QImode);
15162           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15163           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15164         }
15165       else
15166         {
15167           pair = gen_reg_rtx (OImode);
15168           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15169           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15170         }
15171     }
15172 }
15173
15174 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15175    NELT is the number of elements in the vector.  */
15176
15177 void
15178 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15179                          unsigned int nelt)
15180 {
15181   machine_mode vmode = GET_MODE (target);
15182   bool one_vector_p = rtx_equal_p (op0, op1);
15183   rtx mask;
15184
15185   /* The TBL instruction does not use a modulo index, so we must take care
15186      of that ourselves.  */
15187   mask = aarch64_simd_gen_const_vector_dup (vmode,
15188       one_vector_p ? nelt - 1 : 2 * nelt - 1);
15189   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15190
15191   /* For big-endian, we also need to reverse the index within the vector
15192      (but not which vector).  */
15193   if (BYTES_BIG_ENDIAN)
15194     {
15195       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
15196       if (!one_vector_p)
15197         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15198       sel = expand_simple_binop (vmode, XOR, sel, mask,
15199                                  NULL, 0, OPTAB_LIB_WIDEN);
15200     }
15201   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15202 }
15203
15204 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
15205
15206 static void
15207 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15208 {
15209   emit_insn (gen_rtx_SET (target,
15210                           gen_rtx_UNSPEC (GET_MODE (target),
15211                                           gen_rtvec (2, op0, op1), code)));
15212 }
15213
15214 /* Expand an SVE vec_perm with the given operands.  */
15215
15216 void
15217 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15218 {
15219   machine_mode data_mode = GET_MODE (target);
15220   machine_mode sel_mode = GET_MODE (sel);
15221   /* Enforced by the pattern condition.  */
15222   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15223
15224   /* Note: vec_perm indices are supposed to wrap when they go beyond the
15225      size of the two value vectors, i.e. the upper bits of the indices
15226      are effectively ignored.  SVE TBL instead produces 0 for any
15227      out-of-range indices, so we need to modulo all the vec_perm indices
15228      to ensure they are all in range.  */
15229   rtx sel_reg = force_reg (sel_mode, sel);
15230
15231   /* Check if the sel only references the first values vector.  */
15232   if (GET_CODE (sel) == CONST_VECTOR
15233       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15234     {
15235       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15236       return;
15237     }
15238
15239   /* Check if the two values vectors are the same.  */
15240   if (rtx_equal_p (op0, op1))
15241     {
15242       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15243       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15244                                          NULL, 0, OPTAB_DIRECT);
15245       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15246       return;
15247     }
15248
15249   /* Run TBL on for each value vector and combine the results.  */
15250
15251   rtx res0 = gen_reg_rtx (data_mode);
15252   rtx res1 = gen_reg_rtx (data_mode);
15253   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15254   if (GET_CODE (sel) != CONST_VECTOR
15255       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15256     {
15257       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15258                                                        2 * nunits - 1);
15259       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15260                                      NULL, 0, OPTAB_DIRECT);
15261     }
15262   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15263   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15264                                      NULL, 0, OPTAB_DIRECT);
15265   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15266   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15267     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15268   else
15269     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15270 }
15271
15272 /* Recognize patterns suitable for the TRN instructions.  */
15273 static bool
15274 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15275 {
15276   HOST_WIDE_INT odd;
15277   poly_uint64 nelt = d->perm.length ();
15278   rtx out, in0, in1, x;
15279   machine_mode vmode = d->vmode;
15280
15281   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15282     return false;
15283
15284   /* Note that these are little-endian tests.
15285      We correct for big-endian later.  */
15286   if (!d->perm[0].is_constant (&odd)
15287       || (odd != 0 && odd != 1)
15288       || !d->perm.series_p (0, 2, odd, 2)
15289       || !d->perm.series_p (1, 2, nelt + odd, 2))
15290     return false;
15291
15292   /* Success!  */
15293   if (d->testing_p)
15294     return true;
15295
15296   in0 = d->op0;
15297   in1 = d->op1;
15298   /* We don't need a big-endian lane correction for SVE; see the comment
15299      at the head of aarch64-sve.md for details.  */
15300   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15301     {
15302       x = in0, in0 = in1, in1 = x;
15303       odd = !odd;
15304     }
15305   out = d->target;
15306
15307   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15308                                       odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15309   return true;
15310 }
15311
15312 /* Recognize patterns suitable for the UZP instructions.  */
15313 static bool
15314 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15315 {
15316   HOST_WIDE_INT odd;
15317   rtx out, in0, in1, x;
15318   machine_mode vmode = d->vmode;
15319
15320   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15321     return false;
15322
15323   /* Note that these are little-endian tests.
15324      We correct for big-endian later.  */
15325   if (!d->perm[0].is_constant (&odd)
15326       || (odd != 0 && odd != 1)
15327       || !d->perm.series_p (0, 1, odd, 2))
15328     return false;
15329
15330   /* Success!  */
15331   if (d->testing_p)
15332     return true;
15333
15334   in0 = d->op0;
15335   in1 = d->op1;
15336   /* We don't need a big-endian lane correction for SVE; see the comment
15337      at the head of aarch64-sve.md for details.  */
15338   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15339     {
15340       x = in0, in0 = in1, in1 = x;
15341       odd = !odd;
15342     }
15343   out = d->target;
15344
15345   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15346                                       odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15347   return true;
15348 }
15349
15350 /* Recognize patterns suitable for the ZIP instructions.  */
15351 static bool
15352 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15353 {
15354   unsigned int high;
15355   poly_uint64 nelt = d->perm.length ();
15356   rtx out, in0, in1, x;
15357   machine_mode vmode = d->vmode;
15358
15359   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15360     return false;
15361
15362   /* Note that these are little-endian tests.
15363      We correct for big-endian later.  */
15364   poly_uint64 first = d->perm[0];
15365   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15366       || !d->perm.series_p (0, 2, first, 1)
15367       || !d->perm.series_p (1, 2, first + nelt, 1))
15368     return false;
15369   high = maybe_ne (first, 0U);
15370
15371   /* Success!  */
15372   if (d->testing_p)
15373     return true;
15374
15375   in0 = d->op0;
15376   in1 = d->op1;
15377   /* We don't need a big-endian lane correction for SVE; see the comment
15378      at the head of aarch64-sve.md for details.  */
15379   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15380     {
15381       x = in0, in0 = in1, in1 = x;
15382       high = !high;
15383     }
15384   out = d->target;
15385
15386   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15387                                       high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15388   return true;
15389 }
15390
15391 /* Recognize patterns for the EXT insn.  */
15392
15393 static bool
15394 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15395 {
15396   HOST_WIDE_INT location;
15397   rtx offset;
15398
15399   /* The first element always refers to the first vector.
15400      Check if the extracted indices are increasing by one.  */
15401   if (d->vec_flags == VEC_SVE_PRED
15402       || !d->perm[0].is_constant (&location)
15403       || !d->perm.series_p (0, 1, location, 1))
15404     return false;
15405
15406   /* Success! */
15407   if (d->testing_p)
15408     return true;
15409
15410   /* The case where (location == 0) is a no-op for both big- and little-endian,
15411      and is removed by the mid-end at optimization levels -O1 and higher.
15412
15413      We don't need a big-endian lane correction for SVE; see the comment
15414      at the head of aarch64-sve.md for details.  */
15415   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15416     {
15417       /* After setup, we want the high elements of the first vector (stored
15418          at the LSB end of the register), and the low elements of the second
15419          vector (stored at the MSB end of the register). So swap.  */
15420       std::swap (d->op0, d->op1);
15421       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15422          to_constant () is safe since this is restricted to Advanced SIMD
15423          vectors.  */
15424       location = d->perm.length ().to_constant () - location;
15425     }
15426
15427   offset = GEN_INT (location);
15428   emit_set_insn (d->target,
15429                  gen_rtx_UNSPEC (d->vmode,
15430                                  gen_rtvec (3, d->op0, d->op1, offset),
15431                                  UNSPEC_EXT));
15432   return true;
15433 }
15434
15435 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15436    within each 64-bit, 32-bit or 16-bit granule.  */
15437
15438 static bool
15439 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15440 {
15441   HOST_WIDE_INT diff;
15442   unsigned int i, size, unspec;
15443   machine_mode pred_mode;
15444
15445   if (d->vec_flags == VEC_SVE_PRED
15446       || !d->one_vector_p
15447       || !d->perm[0].is_constant (&diff))
15448     return false;
15449
15450   size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15451   if (size == 8)
15452     {
15453       unspec = UNSPEC_REV64;
15454       pred_mode = VNx2BImode;
15455     }
15456   else if (size == 4)
15457     {
15458       unspec = UNSPEC_REV32;
15459       pred_mode = VNx4BImode;
15460     }
15461   else if (size == 2)
15462     {
15463       unspec = UNSPEC_REV16;
15464       pred_mode = VNx8BImode;
15465     }
15466   else
15467     return false;
15468
15469   unsigned int step = diff + 1;
15470   for (i = 0; i < step; ++i)
15471     if (!d->perm.series_p (i, step, diff - i, step))
15472       return false;
15473
15474   /* Success! */
15475   if (d->testing_p)
15476     return true;
15477
15478   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15479   if (d->vec_flags == VEC_SVE_DATA)
15480     {
15481       rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15482       src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15483                             UNSPEC_MERGE_PTRUE);
15484     }
15485   emit_set_insn (d->target, src);
15486   return true;
15487 }
15488
15489 /* Recognize patterns for the REV insn, which reverses elements within
15490    a full vector.  */
15491
15492 static bool
15493 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15494 {
15495   poly_uint64 nelt = d->perm.length ();
15496
15497   if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15498     return false;
15499
15500   if (!d->perm.series_p (0, 1, nelt - 1, -1))
15501     return false;
15502
15503   /* Success! */
15504   if (d->testing_p)
15505     return true;
15506
15507   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15508   emit_set_insn (d->target, src);
15509   return true;
15510 }
15511
15512 static bool
15513 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15514 {
15515   rtx out = d->target;
15516   rtx in0;
15517   HOST_WIDE_INT elt;
15518   machine_mode vmode = d->vmode;
15519   rtx lane;
15520
15521   if (d->vec_flags == VEC_SVE_PRED
15522       || d->perm.encoding ().encoded_nelts () != 1
15523       || !d->perm[0].is_constant (&elt))
15524     return false;
15525
15526   if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15527     return false;
15528
15529   /* Success! */
15530   if (d->testing_p)
15531     return true;
15532
15533   /* The generic preparation in aarch64_expand_vec_perm_const_1
15534      swaps the operand order and the permute indices if it finds
15535      d->perm[0] to be in the second operand.  Thus, we can always
15536      use d->op0 and need not do any extra arithmetic to get the
15537      correct lane number.  */
15538   in0 = d->op0;
15539   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
15540
15541   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15542   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15543   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15544   return true;
15545 }
15546
15547 static bool
15548 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15549 {
15550   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15551   machine_mode vmode = d->vmode;
15552
15553   /* Make sure that the indices are constant.  */
15554   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15555   for (unsigned int i = 0; i < encoded_nelts; ++i)
15556     if (!d->perm[i].is_constant ())
15557       return false;
15558
15559   if (d->testing_p)
15560     return true;
15561
15562   /* Generic code will try constant permutation twice.  Once with the
15563      original mode and again with the elements lowered to QImode.
15564      So wait and don't do the selector expansion ourselves.  */
15565   if (vmode != V8QImode && vmode != V16QImode)
15566     return false;
15567
15568   /* to_constant is safe since this routine is specific to Advanced SIMD
15569      vectors.  */
15570   unsigned int nelt = d->perm.length ().to_constant ();
15571   for (unsigned int i = 0; i < nelt; ++i)
15572     /* If big-endian and two vectors we end up with a weird mixed-endian
15573        mode on NEON.  Reverse the index within each word but not the word
15574        itself.  to_constant is safe because we checked is_constant above.  */
15575     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15576                         ? d->perm[i].to_constant () ^ (nelt - 1)
15577                         : d->perm[i].to_constant ());
15578
15579   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15580   sel = force_reg (vmode, sel);
15581
15582   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15583   return true;
15584 }
15585
15586 /* Try to implement D using an SVE TBL instruction.  */
15587
15588 static bool
15589 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15590 {
15591   unsigned HOST_WIDE_INT nelt;
15592
15593   /* Permuting two variable-length vectors could overflow the
15594      index range.  */
15595   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15596     return false;
15597
15598   if (d->testing_p)
15599     return true;
15600
15601   machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15602   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15603   aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15604   return true;
15605 }
15606
15607 static bool
15608 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15609 {
15610   /* The pattern matching functions above are written to look for a small
15611      number to begin the sequence (0, 1, N/2).  If we begin with an index
15612      from the second operand, we can swap the operands.  */
15613   poly_int64 nelt = d->perm.length ();
15614   if (known_ge (d->perm[0], nelt))
15615     {
15616       d->perm.rotate_inputs (1);
15617       std::swap (d->op0, d->op1);
15618     }
15619
15620   if ((d->vec_flags == VEC_ADVSIMD
15621        || d->vec_flags == VEC_SVE_DATA
15622        || d->vec_flags == VEC_SVE_PRED)
15623       && known_gt (nelt, 1))
15624     {
15625       if (aarch64_evpc_rev_local (d))
15626         return true;
15627       else if (aarch64_evpc_rev_global (d))
15628         return true;
15629       else if (aarch64_evpc_ext (d))
15630         return true;
15631       else if (aarch64_evpc_dup (d))
15632         return true;
15633       else if (aarch64_evpc_zip (d))
15634         return true;
15635       else if (aarch64_evpc_uzp (d))
15636         return true;
15637       else if (aarch64_evpc_trn (d))
15638         return true;
15639       if (d->vec_flags == VEC_SVE_DATA)
15640         return aarch64_evpc_sve_tbl (d);
15641       else if (d->vec_flags == VEC_SVE_DATA)
15642         return aarch64_evpc_tbl (d);
15643     }
15644   return false;
15645 }
15646
15647 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
15648
15649 static bool
15650 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15651                                   rtx op1, const vec_perm_indices &sel)
15652 {
15653   struct expand_vec_perm_d d;
15654
15655   /* Check whether the mask can be applied to a single vector.  */
15656   if (op0 && rtx_equal_p (op0, op1))
15657     d.one_vector_p = true;
15658   else if (sel.all_from_input_p (0))
15659     {
15660       d.one_vector_p = true;
15661       op1 = op0;
15662     }
15663   else if (sel.all_from_input_p (1))
15664     {
15665       d.one_vector_p = true;
15666       op0 = op1;
15667     }
15668   else
15669     d.one_vector_p = false;
15670
15671   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15672                      sel.nelts_per_input ());
15673   d.vmode = vmode;
15674   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15675   d.target = target;
15676   d.op0 = op0;
15677   d.op1 = op1;
15678   d.testing_p = !target;
15679
15680   if (!d.testing_p)
15681     return aarch64_expand_vec_perm_const_1 (&d);
15682
15683   rtx_insn *last = get_last_insn ();
15684   bool ret = aarch64_expand_vec_perm_const_1 (&d);
15685   gcc_assert (last == get_last_insn ());
15686
15687   return ret;
15688 }
15689
15690 /* Generate a byte permute mask for a register of mode MODE,
15691    which has NUNITS units.  */
15692
15693 rtx
15694 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15695 {
15696   /* We have to reverse each vector because we dont have
15697      a permuted load that can reverse-load according to ABI rules.  */
15698   rtx mask;
15699   rtvec v = rtvec_alloc (16);
15700   unsigned int i, j;
15701   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15702
15703   gcc_assert (BYTES_BIG_ENDIAN);
15704   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15705
15706   for (i = 0; i < nunits; i++)
15707     for (j = 0; j < usize; j++)
15708       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15709   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15710   return force_reg (V16QImode, mask);
15711 }
15712
15713 /* Return true if X is a valid second operand for the SVE instruction
15714    that implements integer comparison OP_CODE.  */
15715
15716 static bool
15717 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15718 {
15719   if (register_operand (x, VOIDmode))
15720     return true;
15721
15722   switch (op_code)
15723     {
15724     case LTU:
15725     case LEU:
15726     case GEU:
15727     case GTU:
15728       return aarch64_sve_cmp_immediate_p (x, false);
15729     case LT:
15730     case LE:
15731     case GE:
15732     case GT:
15733     case NE:
15734     case EQ:
15735       return aarch64_sve_cmp_immediate_p (x, true);
15736     default:
15737       gcc_unreachable ();
15738     }
15739 }
15740
15741 /* Use predicated SVE instructions to implement the equivalent of:
15742
15743      (set TARGET OP)
15744
15745    given that PTRUE is an all-true predicate of the appropriate mode.  */
15746
15747 static void
15748 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15749 {
15750   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15751                                gen_rtvec (2, ptrue, op),
15752                                UNSPEC_MERGE_PTRUE);
15753   rtx_insn *insn = emit_set_insn (target, unspec);
15754   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15755 }
15756
15757 /* Likewise, but also clobber the condition codes.  */
15758
15759 static void
15760 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15761 {
15762   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15763                                gen_rtvec (2, ptrue, op),
15764                                UNSPEC_MERGE_PTRUE);
15765   rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15766   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15767 }
15768
15769 /* Return the UNSPEC_COND_* code for comparison CODE.  */
15770
15771 static unsigned int
15772 aarch64_unspec_cond_code (rtx_code code)
15773 {
15774   switch (code)
15775     {
15776     case NE:
15777       return UNSPEC_COND_NE;
15778     case EQ:
15779       return UNSPEC_COND_EQ;
15780     case LT:
15781       return UNSPEC_COND_LT;
15782     case GT:
15783       return UNSPEC_COND_GT;
15784     case LE:
15785       return UNSPEC_COND_LE;
15786     case GE:
15787       return UNSPEC_COND_GE;
15788     default:
15789       gcc_unreachable ();
15790     }
15791 }
15792
15793 /* Emit:
15794
15795       (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15796
15797    where <X> is the operation associated with comparison CODE.  This form
15798    of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15799    semantics, such as when PRED might not be all-true and when comparing
15800    inactive lanes could have side effects.  */
15801
15802 static void
15803 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15804                                   rtx pred, rtx op0, rtx op1)
15805 {
15806   rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15807                                gen_rtvec (3, pred, op0, op1),
15808                                aarch64_unspec_cond_code (code));
15809   emit_set_insn (target, unspec);
15810 }
15811
15812 /* Expand an SVE integer comparison using the SVE equivalent of:
15813
15814      (set TARGET (CODE OP0 OP1)).  */
15815
15816 void
15817 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15818 {
15819   machine_mode pred_mode = GET_MODE (target);
15820   machine_mode data_mode = GET_MODE (op0);
15821
15822   if (!aarch64_sve_cmp_operand_p (code, op1))
15823     op1 = force_reg (data_mode, op1);
15824
15825   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15826   rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15827   aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15828 }
15829
15830 /* Emit the SVE equivalent of:
15831
15832       (set TMP1 (CODE1 OP0 OP1))
15833       (set TMP2 (CODE2 OP0 OP1))
15834       (set TARGET (ior:PRED_MODE TMP1 TMP2))
15835
15836    PTRUE is an all-true predicate with the same mode as TARGET.  */
15837
15838 static void
15839 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15840                            rtx ptrue, rtx op0, rtx op1)
15841 {
15842   machine_mode pred_mode = GET_MODE (ptrue);
15843   rtx tmp1 = gen_reg_rtx (pred_mode);
15844   aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15845                              gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15846   rtx tmp2 = gen_reg_rtx (pred_mode);
15847   aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15848                              gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15849   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15850 }
15851
15852 /* Emit the SVE equivalent of:
15853
15854       (set TMP (CODE OP0 OP1))
15855       (set TARGET (not TMP))
15856
15857    PTRUE is an all-true predicate with the same mode as TARGET.  */
15858
15859 static void
15860 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15861                                 rtx op0, rtx op1)
15862 {
15863   machine_mode pred_mode = GET_MODE (ptrue);
15864   rtx tmp = gen_reg_rtx (pred_mode);
15865   aarch64_emit_sve_ptrue_op (tmp, ptrue,
15866                              gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15867   aarch64_emit_unop (target, one_cmpl_optab, tmp);
15868 }
15869
15870 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15871
15872      (set TARGET (CODE OP0 OP1))
15873
15874    If CAN_INVERT_P is true, the caller can also handle inverted results;
15875    return true if the result is in fact inverted.  */
15876
15877 bool
15878 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15879                                   rtx op0, rtx op1, bool can_invert_p)
15880 {
15881   machine_mode pred_mode = GET_MODE (target);
15882   machine_mode data_mode = GET_MODE (op0);
15883
15884   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15885   switch (code)
15886     {
15887     case UNORDERED:
15888       /* UNORDERED has no immediate form.  */
15889       op1 = force_reg (data_mode, op1);
15890       /* fall through */
15891     case LT:
15892     case LE:
15893     case GT:
15894     case GE:
15895     case EQ:
15896     case NE:
15897       {
15898         /* There is native support for the comparison.  */
15899         rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15900         aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15901         return false;
15902       }
15903
15904     case LTGT:
15905       /* This is a trapping operation (LT or GT).  */
15906       aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15907       return false;
15908
15909     case UNEQ:
15910       if (!flag_trapping_math)
15911         {
15912           /* This would trap for signaling NaNs.  */
15913           op1 = force_reg (data_mode, op1);
15914           aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15915           return false;
15916         }
15917       /* fall through */
15918     case UNLT:
15919     case UNLE:
15920     case UNGT:
15921     case UNGE:
15922       if (flag_trapping_math)
15923         {
15924           /* Work out which elements are ordered.  */
15925           rtx ordered = gen_reg_rtx (pred_mode);
15926           op1 = force_reg (data_mode, op1);
15927           aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15928
15929           /* Test the opposite condition for the ordered elements,
15930              then invert the result.  */
15931           if (code == UNEQ)
15932             code = NE;
15933           else
15934             code = reverse_condition_maybe_unordered (code);
15935           if (can_invert_p)
15936             {
15937               aarch64_emit_sve_predicated_cond (target, code,
15938                                                 ordered, op0, op1);
15939               return true;
15940             }
15941           rtx tmp = gen_reg_rtx (pred_mode);
15942           aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15943           aarch64_emit_unop (target, one_cmpl_optab, tmp);
15944           return false;
15945         }
15946       break;
15947
15948     case ORDERED:
15949       /* ORDERED has no immediate form.  */
15950       op1 = force_reg (data_mode, op1);
15951       break;
15952
15953     default:
15954       gcc_unreachable ();
15955     }
15956
15957   /* There is native support for the inverse comparison.  */
15958   code = reverse_condition_maybe_unordered (code);
15959   if (can_invert_p)
15960     {
15961       rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15962       aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15963       return true;
15964     }
15965   aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
15966   return false;
15967 }
15968
15969 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
15970    of the data being selected and CMP_MODE is the mode of the values being
15971    compared.  */
15972
15973 void
15974 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15975                           rtx *ops)
15976 {
15977   machine_mode pred_mode
15978     = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15979                              GET_MODE_SIZE (cmp_mode)).require ();
15980   rtx pred = gen_reg_rtx (pred_mode);
15981   if (FLOAT_MODE_P (cmp_mode))
15982     {
15983       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15984                                             ops[4], ops[5], true))
15985         std::swap (ops[1], ops[2]);
15986     }
15987   else
15988     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15989
15990   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15991   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15992 }
15993
15994 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
15995    true.  However due to issues with register allocation it is preferable
15996    to avoid tieing integer scalar and FP scalar modes.  Executing integer
15997    operations in general registers is better than treating them as scalar
15998    vector operations.  This reduces latency and avoids redundant int<->FP
15999    moves.  So tie modes if they are either the same class, or vector modes
16000    with other vector modes, vector structs or any scalar mode.  */
16001
16002 static bool
16003 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16004 {
16005   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16006     return true;
16007
16008   /* We specifically want to allow elements of "structure" modes to
16009      be tieable to the structure.  This more general condition allows
16010      other rarer situations too.  The reason we don't extend this to
16011      predicate modes is that there are no predicate structure modes
16012      nor any specific instructions for extracting part of a predicate
16013      register.  */
16014   if (aarch64_vector_data_mode_p (mode1)
16015       && aarch64_vector_data_mode_p (mode2))
16016     return true;
16017
16018   /* Also allow any scalar modes with vectors.  */
16019   if (aarch64_vector_mode_supported_p (mode1)
16020       || aarch64_vector_mode_supported_p (mode2))
16021     return true;
16022
16023   return false;
16024 }
16025
16026 /* Return a new RTX holding the result of moving POINTER forward by
16027    AMOUNT bytes.  */
16028
16029 static rtx
16030 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16031 {
16032   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16033
16034   return adjust_automodify_address (pointer, GET_MODE (pointer),
16035                                     next, amount);
16036 }
16037
16038 /* Return a new RTX holding the result of moving POINTER forward by the
16039    size of the mode it points to.  */
16040
16041 static rtx
16042 aarch64_progress_pointer (rtx pointer)
16043 {
16044   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16045 }
16046
16047 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16048    MODE bytes.  */
16049
16050 static void
16051 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16052                                               machine_mode mode)
16053 {
16054   rtx reg = gen_reg_rtx (mode);
16055
16056   /* "Cast" the pointers to the correct mode.  */
16057   *src = adjust_address (*src, mode, 0);
16058   *dst = adjust_address (*dst, mode, 0);
16059   /* Emit the memcpy.  */
16060   emit_move_insn (reg, *src);
16061   emit_move_insn (*dst, reg);
16062   /* Move the pointers forward.  */
16063   *src = aarch64_progress_pointer (*src);
16064   *dst = aarch64_progress_pointer (*dst);
16065 }
16066
16067 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
16068    we succeed, otherwise return false.  */
16069
16070 bool
16071 aarch64_expand_movmem (rtx *operands)
16072 {
16073   unsigned int n;
16074   rtx dst = operands[0];
16075   rtx src = operands[1];
16076   rtx base;
16077   bool speed_p = !optimize_function_for_size_p (cfun);
16078
16079   /* When optimizing for size, give a better estimate of the length of a
16080      memcpy call, but use the default otherwise.  */
16081   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16082
16083   /* We can't do anything smart if the amount to copy is not constant.  */
16084   if (!CONST_INT_P (operands[2]))
16085     return false;
16086
16087   n = UINTVAL (operands[2]);
16088
16089   /* Try to keep the number of instructions low.  For cases below 16 bytes we
16090      need to make at most two moves.  For cases above 16 bytes it will be one
16091      move for each 16 byte chunk, then at most two additional moves.  */
16092   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16093     return false;
16094
16095   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16096   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16097
16098   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16099   src = adjust_automodify_address (src, VOIDmode, base, 0);
16100
16101   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16102      1-byte chunk.  */
16103   if (n < 4)
16104     {
16105       if (n >= 2)
16106         {
16107           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16108           n -= 2;
16109         }
16110
16111       if (n == 1)
16112         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16113
16114       return true;
16115     }
16116
16117   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
16118      4-byte chunk, partially overlapping with the previously copied chunk.  */
16119   if (n < 8)
16120     {
16121       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16122       n -= 4;
16123       if (n > 0)
16124         {
16125           int move = n - 4;
16126
16127           src = aarch64_move_pointer (src, move);
16128           dst = aarch64_move_pointer (dst, move);
16129           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16130         }
16131       return true;
16132     }
16133
16134   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
16135      them, then (if applicable) an 8-byte chunk.  */
16136   while (n >= 8)
16137     {
16138       if (n / 16)
16139         {
16140           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16141           n -= 16;
16142         }
16143       else
16144         {
16145           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16146           n -= 8;
16147         }
16148     }
16149
16150   /* Finish the final bytes of the copy.  We can always do this in one
16151      instruction.  We either copy the exact amount we need, or partially
16152      overlap with the previous chunk we copied and copy 8-bytes.  */
16153   if (n == 0)
16154     return true;
16155   else if (n == 1)
16156     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16157   else if (n == 2)
16158     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16159   else if (n == 4)
16160     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16161   else
16162     {
16163       if (n == 3)
16164         {
16165           src = aarch64_move_pointer (src, -1);
16166           dst = aarch64_move_pointer (dst, -1);
16167           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16168         }
16169       else
16170         {
16171           int move = n - 8;
16172
16173           src = aarch64_move_pointer (src, move);
16174           dst = aarch64_move_pointer (dst, move);
16175           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16176         }
16177     }
16178
16179   return true;
16180 }
16181
16182 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16183    SImode stores.  Handle the case when the constant has identical
16184    bottom and top halves.  This is beneficial when the two stores can be
16185    merged into an STP and we avoid synthesising potentially expensive
16186    immediates twice.  Return true if such a split is possible.  */
16187
16188 bool
16189 aarch64_split_dimode_const_store (rtx dst, rtx src)
16190 {
16191   rtx lo = gen_lowpart (SImode, src);
16192   rtx hi = gen_highpart_mode (SImode, DImode, src);
16193
16194   bool size_p = optimize_function_for_size_p (cfun);
16195
16196   if (!rtx_equal_p (lo, hi))
16197     return false;
16198
16199   unsigned int orig_cost
16200     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16201   unsigned int lo_cost
16202     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16203
16204   /* We want to transform:
16205      MOV        x1, 49370
16206      MOVK       x1, 0x140, lsl 16
16207      MOVK       x1, 0xc0da, lsl 32
16208      MOVK       x1, 0x140, lsl 48
16209      STR        x1, [x0]
16210    into:
16211      MOV        w1, 49370
16212      MOVK       w1, 0x140, lsl 16
16213      STP        w1, w1, [x0]
16214    So we want to perform this only when we save two instructions
16215    or more.  When optimizing for size, however, accept any code size
16216    savings we can.  */
16217   if (size_p && orig_cost <= lo_cost)
16218     return false;
16219
16220   if (!size_p
16221       && (orig_cost <= lo_cost + 1))
16222     return false;
16223
16224   rtx mem_lo = adjust_address (dst, SImode, 0);
16225   if (!aarch64_mem_pair_operand (mem_lo, SImode))
16226     return false;
16227
16228   rtx tmp_reg = gen_reg_rtx (SImode);
16229   aarch64_expand_mov_immediate (tmp_reg, lo);
16230   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16231   /* Don't emit an explicit store pair as this may not be always profitable.
16232      Let the sched-fusion logic decide whether to merge them.  */
16233   emit_move_insn (mem_lo, tmp_reg);
16234   emit_move_insn (mem_hi, tmp_reg);
16235
16236   return true;
16237 }
16238
16239 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
16240
16241 static unsigned HOST_WIDE_INT
16242 aarch64_asan_shadow_offset (void)
16243 {
16244   return (HOST_WIDE_INT_1 << 36);
16245 }
16246
16247 static rtx
16248 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16249                         int code, tree treeop0, tree treeop1)
16250 {
16251   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16252   rtx op0, op1;
16253   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16254   insn_code icode;
16255   struct expand_operand ops[4];
16256
16257   start_sequence ();
16258   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16259
16260   op_mode = GET_MODE (op0);
16261   if (op_mode == VOIDmode)
16262     op_mode = GET_MODE (op1);
16263
16264   switch (op_mode)
16265     {
16266     case E_QImode:
16267     case E_HImode:
16268     case E_SImode:
16269       cmp_mode = SImode;
16270       icode = CODE_FOR_cmpsi;
16271       break;
16272
16273     case E_DImode:
16274       cmp_mode = DImode;
16275       icode = CODE_FOR_cmpdi;
16276       break;
16277
16278     case E_SFmode:
16279       cmp_mode = SFmode;
16280       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16281       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16282       break;
16283
16284     case E_DFmode:
16285       cmp_mode = DFmode;
16286       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16287       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16288       break;
16289
16290     default:
16291       end_sequence ();
16292       return NULL_RTX;
16293     }
16294
16295   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16296   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16297   if (!op0 || !op1)
16298     {
16299       end_sequence ();
16300       return NULL_RTX;
16301     }
16302   *prep_seq = get_insns ();
16303   end_sequence ();
16304
16305   create_fixed_operand (&ops[0], op0);
16306   create_fixed_operand (&ops[1], op1);
16307
16308   start_sequence ();
16309   if (!maybe_expand_insn (icode, 2, ops))
16310     {
16311       end_sequence ();
16312       return NULL_RTX;
16313     }
16314   *gen_seq = get_insns ();
16315   end_sequence ();
16316
16317   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16318                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16319 }
16320
16321 static rtx
16322 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16323                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
16324 {
16325   rtx op0, op1, target;
16326   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16327   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16328   insn_code icode;
16329   struct expand_operand ops[6];
16330   int aarch64_cond;
16331
16332   push_to_sequence (*prep_seq);
16333   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16334
16335   op_mode = GET_MODE (op0);
16336   if (op_mode == VOIDmode)
16337     op_mode = GET_MODE (op1);
16338
16339   switch (op_mode)
16340     {
16341     case E_QImode:
16342     case E_HImode:
16343     case E_SImode:
16344       cmp_mode = SImode;
16345       icode = CODE_FOR_ccmpsi;
16346       break;
16347
16348     case E_DImode:
16349       cmp_mode = DImode;
16350       icode = CODE_FOR_ccmpdi;
16351       break;
16352
16353     case E_SFmode:
16354       cmp_mode = SFmode;
16355       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16356       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16357       break;
16358
16359     case E_DFmode:
16360       cmp_mode = DFmode;
16361       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16362       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16363       break;
16364
16365     default:
16366       end_sequence ();
16367       return NULL_RTX;
16368     }
16369
16370   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16371   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16372   if (!op0 || !op1)
16373     {
16374       end_sequence ();
16375       return NULL_RTX;
16376     }
16377   *prep_seq = get_insns ();
16378   end_sequence ();
16379
16380   target = gen_rtx_REG (cc_mode, CC_REGNUM);
16381   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16382
16383   if (bit_code != AND)
16384     {
16385       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16386                                                 GET_MODE (XEXP (prev, 0))),
16387                              VOIDmode, XEXP (prev, 0), const0_rtx);
16388       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16389     }
16390
16391   create_fixed_operand (&ops[0], XEXP (prev, 0));
16392   create_fixed_operand (&ops[1], target);
16393   create_fixed_operand (&ops[2], op0);
16394   create_fixed_operand (&ops[3], op1);
16395   create_fixed_operand (&ops[4], prev);
16396   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16397
16398   push_to_sequence (*gen_seq);
16399   if (!maybe_expand_insn (icode, 6, ops))
16400     {
16401       end_sequence ();
16402       return NULL_RTX;
16403     }
16404
16405   *gen_seq = get_insns ();
16406   end_sequence ();
16407
16408   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16409 }
16410
16411 #undef TARGET_GEN_CCMP_FIRST
16412 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16413
16414 #undef TARGET_GEN_CCMP_NEXT
16415 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16416
16417 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
16418    instruction fusion of some sort.  */
16419
16420 static bool
16421 aarch64_macro_fusion_p (void)
16422 {
16423   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16424 }
16425
16426
16427 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
16428    should be kept together during scheduling.  */
16429
16430 static bool
16431 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16432 {
16433   rtx set_dest;
16434   rtx prev_set = single_set (prev);
16435   rtx curr_set = single_set (curr);
16436   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
16437   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16438
16439   if (!aarch64_macro_fusion_p ())
16440     return false;
16441
16442   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16443     {
16444       /* We are trying to match:
16445          prev (mov)  == (set (reg r0) (const_int imm16))
16446          curr (movk) == (set (zero_extract (reg r0)
16447                                            (const_int 16)
16448                                            (const_int 16))
16449                              (const_int imm16_1))  */
16450
16451       set_dest = SET_DEST (curr_set);
16452
16453       if (GET_CODE (set_dest) == ZERO_EXTRACT
16454           && CONST_INT_P (SET_SRC (curr_set))
16455           && CONST_INT_P (SET_SRC (prev_set))
16456           && CONST_INT_P (XEXP (set_dest, 2))
16457           && INTVAL (XEXP (set_dest, 2)) == 16
16458           && REG_P (XEXP (set_dest, 0))
16459           && REG_P (SET_DEST (prev_set))
16460           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16461         {
16462           return true;
16463         }
16464     }
16465
16466   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16467     {
16468
16469       /*  We're trying to match:
16470           prev (adrp) == (set (reg r1)
16471                               (high (symbol_ref ("SYM"))))
16472           curr (add) == (set (reg r0)
16473                              (lo_sum (reg r1)
16474                                      (symbol_ref ("SYM"))))
16475           Note that r0 need not necessarily be the same as r1, especially
16476           during pre-regalloc scheduling.  */
16477
16478       if (satisfies_constraint_Ush (SET_SRC (prev_set))
16479           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16480         {
16481           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16482               && REG_P (XEXP (SET_SRC (curr_set), 0))
16483               && REGNO (XEXP (SET_SRC (curr_set), 0))
16484                  == REGNO (SET_DEST (prev_set))
16485               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16486                               XEXP (SET_SRC (curr_set), 1)))
16487             return true;
16488         }
16489     }
16490
16491   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16492     {
16493
16494       /* We're trying to match:
16495          prev (movk) == (set (zero_extract (reg r0)
16496                                            (const_int 16)
16497                                            (const_int 32))
16498                              (const_int imm16_1))
16499          curr (movk) == (set (zero_extract (reg r0)
16500                                            (const_int 16)
16501                                            (const_int 48))
16502                              (const_int imm16_2))  */
16503
16504       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16505           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16506           && REG_P (XEXP (SET_DEST (prev_set), 0))
16507           && REG_P (XEXP (SET_DEST (curr_set), 0))
16508           && REGNO (XEXP (SET_DEST (prev_set), 0))
16509              == REGNO (XEXP (SET_DEST (curr_set), 0))
16510           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16511           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16512           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16513           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16514           && CONST_INT_P (SET_SRC (prev_set))
16515           && CONST_INT_P (SET_SRC (curr_set)))
16516         return true;
16517
16518     }
16519   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16520     {
16521       /* We're trying to match:
16522           prev (adrp) == (set (reg r0)
16523                               (high (symbol_ref ("SYM"))))
16524           curr (ldr) == (set (reg r1)
16525                              (mem (lo_sum (reg r0)
16526                                              (symbol_ref ("SYM")))))
16527                  or
16528           curr (ldr) == (set (reg r1)
16529                              (zero_extend (mem
16530                                            (lo_sum (reg r0)
16531                                                    (symbol_ref ("SYM"))))))  */
16532       if (satisfies_constraint_Ush (SET_SRC (prev_set))
16533           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16534         {
16535           rtx curr_src = SET_SRC (curr_set);
16536
16537           if (GET_CODE (curr_src) == ZERO_EXTEND)
16538             curr_src = XEXP (curr_src, 0);
16539
16540           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16541               && REG_P (XEXP (XEXP (curr_src, 0), 0))
16542               && REGNO (XEXP (XEXP (curr_src, 0), 0))
16543                  == REGNO (SET_DEST (prev_set))
16544               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16545                               XEXP (SET_SRC (prev_set), 0)))
16546               return true;
16547         }
16548     }
16549
16550   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16551        && aarch_crypto_can_dual_issue (prev, curr))
16552     return true;
16553
16554   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16555       && any_condjump_p (curr))
16556     {
16557       enum attr_type prev_type = get_attr_type (prev);
16558
16559       unsigned int condreg1, condreg2;
16560       rtx cc_reg_1;
16561       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16562       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16563
16564       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16565           && prev
16566           && modified_in_p (cc_reg_1, prev))
16567         {
16568           /* FIXME: this misses some which is considered simple arthematic
16569              instructions for ThunderX.  Simple shifts are missed here.  */
16570           if (prev_type == TYPE_ALUS_SREG
16571               || prev_type == TYPE_ALUS_IMM
16572               || prev_type == TYPE_LOGICS_REG
16573               || prev_type == TYPE_LOGICS_IMM)
16574             return true;
16575         }
16576     }
16577
16578   if (prev_set
16579       && curr_set
16580       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16581       && any_condjump_p (curr))
16582     {
16583       /* We're trying to match:
16584           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16585           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
16586                                                          (const_int 0))
16587                                                  (label_ref ("SYM"))
16588                                                  (pc))  */
16589       if (SET_DEST (curr_set) == (pc_rtx)
16590           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16591           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16592           && REG_P (SET_DEST (prev_set))
16593           && REGNO (SET_DEST (prev_set))
16594              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16595         {
16596           /* Fuse ALU operations followed by conditional branch instruction.  */
16597           switch (get_attr_type (prev))
16598             {
16599             case TYPE_ALU_IMM:
16600             case TYPE_ALU_SREG:
16601             case TYPE_ADC_REG:
16602             case TYPE_ADC_IMM:
16603             case TYPE_ADCS_REG:
16604             case TYPE_ADCS_IMM:
16605             case TYPE_LOGIC_REG:
16606             case TYPE_LOGIC_IMM:
16607             case TYPE_CSEL:
16608             case TYPE_ADR:
16609             case TYPE_MOV_IMM:
16610             case TYPE_SHIFT_REG:
16611             case TYPE_SHIFT_IMM:
16612             case TYPE_BFM:
16613             case TYPE_RBIT:
16614             case TYPE_REV:
16615             case TYPE_EXTEND:
16616               return true;
16617
16618             default:;
16619             }
16620         }
16621     }
16622
16623   return false;
16624 }
16625
16626 /* Return true iff the instruction fusion described by OP is enabled.  */
16627
16628 bool
16629 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16630 {
16631   return (aarch64_tune_params.fusible_ops & op) != 0;
16632 }
16633
16634 /* If MEM is in the form of [base+offset], extract the two parts
16635    of address and set to BASE and OFFSET, otherwise return false
16636    after clearing BASE and OFFSET.  */
16637
16638 bool
16639 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16640 {
16641   rtx addr;
16642
16643   gcc_assert (MEM_P (mem));
16644
16645   addr = XEXP (mem, 0);
16646
16647   if (REG_P (addr))
16648     {
16649       *base = addr;
16650       *offset = const0_rtx;
16651       return true;
16652     }
16653
16654   if (GET_CODE (addr) == PLUS
16655       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16656     {
16657       *base = XEXP (addr, 0);
16658       *offset = XEXP (addr, 1);
16659       return true;
16660     }
16661
16662   *base = NULL_RTX;
16663   *offset = NULL_RTX;
16664
16665   return false;
16666 }
16667
16668 /* Types for scheduling fusion.  */
16669 enum sched_fusion_type
16670 {
16671   SCHED_FUSION_NONE = 0,
16672   SCHED_FUSION_LD_SIGN_EXTEND,
16673   SCHED_FUSION_LD_ZERO_EXTEND,
16674   SCHED_FUSION_LD,
16675   SCHED_FUSION_ST,
16676   SCHED_FUSION_NUM
16677 };
16678
16679 /* If INSN is a load or store of address in the form of [base+offset],
16680    extract the two parts and set to BASE and OFFSET.  Return scheduling
16681    fusion type this INSN is.  */
16682
16683 static enum sched_fusion_type
16684 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16685 {
16686   rtx x, dest, src;
16687   enum sched_fusion_type fusion = SCHED_FUSION_LD;
16688
16689   gcc_assert (INSN_P (insn));
16690   x = PATTERN (insn);
16691   if (GET_CODE (x) != SET)
16692     return SCHED_FUSION_NONE;
16693
16694   src = SET_SRC (x);
16695   dest = SET_DEST (x);
16696
16697   machine_mode dest_mode = GET_MODE (dest);
16698
16699   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16700     return SCHED_FUSION_NONE;
16701
16702   if (GET_CODE (src) == SIGN_EXTEND)
16703     {
16704       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16705       src = XEXP (src, 0);
16706       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16707         return SCHED_FUSION_NONE;
16708     }
16709   else if (GET_CODE (src) == ZERO_EXTEND)
16710     {
16711       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16712       src = XEXP (src, 0);
16713       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16714         return SCHED_FUSION_NONE;
16715     }
16716
16717   if (GET_CODE (src) == MEM && REG_P (dest))
16718     extract_base_offset_in_addr (src, base, offset);
16719   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16720     {
16721       fusion = SCHED_FUSION_ST;
16722       extract_base_offset_in_addr (dest, base, offset);
16723     }
16724   else
16725     return SCHED_FUSION_NONE;
16726
16727   if (*base == NULL_RTX || *offset == NULL_RTX)
16728     fusion = SCHED_FUSION_NONE;
16729
16730   return fusion;
16731 }
16732
16733 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16734
16735    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
16736    and PRI are only calculated for these instructions.  For other instruction,
16737    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
16738    type instruction fusion can be added by returning different priorities.
16739
16740    It's important that irrelevant instructions get the largest FUSION_PRI.  */
16741
16742 static void
16743 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16744                                int *fusion_pri, int *pri)
16745 {
16746   int tmp, off_val;
16747   rtx base, offset;
16748   enum sched_fusion_type fusion;
16749
16750   gcc_assert (INSN_P (insn));
16751
16752   tmp = max_pri - 1;
16753   fusion = fusion_load_store (insn, &base, &offset);
16754   if (fusion == SCHED_FUSION_NONE)
16755     {
16756       *pri = tmp;
16757       *fusion_pri = tmp;
16758       return;
16759     }
16760
16761   /* Set FUSION_PRI according to fusion type and base register.  */
16762   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16763
16764   /* Calculate PRI.  */
16765   tmp /= 2;
16766
16767   /* INSN with smaller offset goes first.  */
16768   off_val = (int)(INTVAL (offset));
16769   if (off_val >= 0)
16770     tmp -= (off_val & 0xfffff);
16771   else
16772     tmp += ((- off_val) & 0xfffff);
16773
16774   *pri = tmp;
16775   return;
16776 }
16777
16778 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16779    Adjust priority of sha1h instructions so they are scheduled before
16780    other SHA1 instructions.  */
16781
16782 static int
16783 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16784 {
16785   rtx x = PATTERN (insn);
16786
16787   if (GET_CODE (x) == SET)
16788     {
16789       x = SET_SRC (x);
16790
16791       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16792         return priority + 10;
16793     }
16794
16795   return priority;
16796 }
16797
16798 /* Given OPERANDS of consecutive load/store, check if we can merge
16799    them into ldp/stp.  LOAD is true if they are load instructions.
16800    MODE is the mode of memory operands.  */
16801
16802 bool
16803 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16804                                 machine_mode mode)
16805 {
16806   HOST_WIDE_INT offval_1, offval_2, msize;
16807   enum reg_class rclass_1, rclass_2;
16808   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16809
16810   if (load)
16811     {
16812       mem_1 = operands[1];
16813       mem_2 = operands[3];
16814       reg_1 = operands[0];
16815       reg_2 = operands[2];
16816       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16817       if (REGNO (reg_1) == REGNO (reg_2))
16818         return false;
16819     }
16820   else
16821     {
16822       mem_1 = operands[0];
16823       mem_2 = operands[2];
16824       reg_1 = operands[1];
16825       reg_2 = operands[3];
16826     }
16827
16828   /* The mems cannot be volatile.  */
16829   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16830     return false;
16831
16832   /* If we have SImode and slow unaligned ldp,
16833      check the alignment to be at least 8 byte. */
16834   if (mode == SImode
16835       && (aarch64_tune_params.extra_tuning_flags
16836           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16837       && !optimize_size
16838       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16839     return false;
16840
16841   /* Check if the addresses are in the form of [base+offset].  */
16842   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16843   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16844     return false;
16845   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16846   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16847     return false;
16848
16849   /* Check if the bases are same.  */
16850   if (!rtx_equal_p (base_1, base_2))
16851     return false;
16852
16853   /* The operands must be of the same size.  */
16854   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16855                          GET_MODE_SIZE (GET_MODE (mem_2))));
16856
16857   offval_1 = INTVAL (offset_1);
16858   offval_2 = INTVAL (offset_2);
16859   /* We should only be trying this for fixed-sized modes.  There is no
16860      SVE LDP/STP instruction.  */
16861   msize = GET_MODE_SIZE (mode).to_constant ();
16862   /* Check if the offsets are consecutive.  */
16863   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16864     return false;
16865
16866   /* Check if the addresses are clobbered by load.  */
16867   if (load)
16868     {
16869       if (reg_mentioned_p (reg_1, mem_1))
16870         return false;
16871
16872       /* In increasing order, the last load can clobber the address.  */
16873       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16874         return false;
16875     }
16876
16877   /* One of the memory accesses must be a mempair operand.
16878      If it is not the first one, they need to be swapped by the
16879      peephole.  */
16880   if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16881        && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16882     return false;
16883
16884   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16885     rclass_1 = FP_REGS;
16886   else
16887     rclass_1 = GENERAL_REGS;
16888
16889   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16890     rclass_2 = FP_REGS;
16891   else
16892     rclass_2 = GENERAL_REGS;
16893
16894   /* Check if the registers are of same class.  */
16895   if (rclass_1 != rclass_2)
16896     return false;
16897
16898   return true;
16899 }
16900
16901 /* Given OPERANDS of consecutive load/store that can be merged,
16902    swap them if they are not in ascending order.  */
16903 void
16904 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16905 {
16906   rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16907   HOST_WIDE_INT offval_1, offval_2;
16908
16909   if (load)
16910     {
16911       mem_1 = operands[1];
16912       mem_2 = operands[3];
16913     }
16914   else
16915     {
16916       mem_1 = operands[0];
16917       mem_2 = operands[2];
16918     }
16919
16920   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16921   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16922
16923   offval_1 = INTVAL (offset_1);
16924   offval_2 = INTVAL (offset_2);
16925
16926   if (offval_1 > offval_2)
16927     {
16928       /* Irrespective of whether this is a load or a store,
16929          we do the same swap.  */
16930       std::swap (operands[0], operands[2]);
16931       std::swap (operands[1], operands[3]);
16932     }
16933 }
16934
16935 /* Given OPERANDS of consecutive load/store, check if we can merge
16936    them into ldp/stp by adjusting the offset.  LOAD is true if they
16937    are load instructions.  MODE is the mode of memory operands.
16938
16939    Given below consecutive stores:
16940
16941      str  w1, [xb, 0x100]
16942      str  w1, [xb, 0x104]
16943      str  w1, [xb, 0x108]
16944      str  w1, [xb, 0x10c]
16945
16946    Though the offsets are out of the range supported by stp, we can
16947    still pair them after adjusting the offset, like:
16948
16949      add  scratch, xb, 0x100
16950      stp  w1, w1, [scratch]
16951      stp  w1, w1, [scratch, 0x8]
16952
16953    The peephole patterns detecting this opportunity should guarantee
16954    the scratch register is avaliable.  */
16955
16956 bool
16957 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16958                                        scalar_mode mode)
16959 {
16960   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16961   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16962   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16963   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16964
16965   if (load)
16966     {
16967       reg_1 = operands[0];
16968       mem_1 = operands[1];
16969       reg_2 = operands[2];
16970       mem_2 = operands[3];
16971       reg_3 = operands[4];
16972       mem_3 = operands[5];
16973       reg_4 = operands[6];
16974       mem_4 = operands[7];
16975       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16976                   && REG_P (reg_3) && REG_P (reg_4));
16977       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16978         return false;
16979     }
16980   else
16981     {
16982       mem_1 = operands[0];
16983       reg_1 = operands[1];
16984       mem_2 = operands[2];
16985       reg_2 = operands[3];
16986       mem_3 = operands[4];
16987       reg_3 = operands[5];
16988       mem_4 = operands[6];
16989       reg_4 = operands[7];
16990     }
16991   /* Skip if memory operand is by itslef valid for ldp/stp.  */
16992   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16993     return false;
16994
16995   /* The mems cannot be volatile.  */
16996   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16997       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
16998     return false;
16999
17000   /* Check if the addresses are in the form of [base+offset].  */
17001   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17002   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17003     return false;
17004   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17005   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17006     return false;
17007   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
17008   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
17009     return false;
17010   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
17011   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
17012     return false;
17013
17014   /* Check if the bases are same.  */
17015   if (!rtx_equal_p (base_1, base_2)
17016       || !rtx_equal_p (base_2, base_3)
17017       || !rtx_equal_p (base_3, base_4))
17018     return false;
17019
17020   offval_1 = INTVAL (offset_1);
17021   offval_2 = INTVAL (offset_2);
17022   offval_3 = INTVAL (offset_3);
17023   offval_4 = INTVAL (offset_4);
17024   msize = GET_MODE_SIZE (mode);
17025   /* Check if the offsets are consecutive.  */
17026   if ((offval_1 != (offval_2 + msize)
17027        || offval_1 != (offval_3 + msize * 2)
17028        || offval_1 != (offval_4 + msize * 3))
17029       && (offval_4 != (offval_3 + msize)
17030           || offval_4 != (offval_2 + msize * 2)
17031           || offval_4 != (offval_1 + msize * 3)))
17032     return false;
17033
17034   /* Check if the addresses are clobbered by load.  */
17035   if (load)
17036     {
17037       if (reg_mentioned_p (reg_1, mem_1)
17038           || reg_mentioned_p (reg_2, mem_2)
17039           || reg_mentioned_p (reg_3, mem_3))
17040         return false;
17041
17042       /* In increasing order, the last load can clobber the address.  */
17043       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
17044         return false;
17045     }
17046
17047   /* If we have SImode and slow unaligned ldp,
17048      check the alignment to be at least 8 byte. */
17049   if (mode == SImode
17050       && (aarch64_tune_params.extra_tuning_flags
17051           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17052       && !optimize_size
17053       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17054     return false;
17055
17056   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17057     rclass_1 = FP_REGS;
17058   else
17059     rclass_1 = GENERAL_REGS;
17060
17061   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17062     rclass_2 = FP_REGS;
17063   else
17064     rclass_2 = GENERAL_REGS;
17065
17066   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17067     rclass_3 = FP_REGS;
17068   else
17069     rclass_3 = GENERAL_REGS;
17070
17071   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17072     rclass_4 = FP_REGS;
17073   else
17074     rclass_4 = GENERAL_REGS;
17075
17076   /* Check if the registers are of same class.  */
17077   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17078     return false;
17079
17080   return true;
17081 }
17082
17083 /* Given OPERANDS of consecutive load/store, this function pairs them
17084    into ldp/stp after adjusting the offset.  It depends on the fact
17085    that addresses of load/store instructions are in increasing order.
17086    MODE is the mode of memory operands.  CODE is the rtl operator
17087    which should be applied to all memory operands, it's SIGN_EXTEND,
17088    ZERO_EXTEND or UNKNOWN.  */
17089
17090 bool
17091 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17092                              scalar_mode mode, RTX_CODE code)
17093 {
17094   rtx base, offset_1, offset_2, t1, t2;
17095   rtx mem_1, mem_2, mem_3, mem_4;
17096   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
17097
17098   if (load)
17099     {
17100       mem_1 = operands[1];
17101       mem_2 = operands[3];
17102     }
17103   else
17104     {
17105       mem_1 = operands[0];
17106       mem_2 = operands[2];
17107     }
17108
17109   extract_base_offset_in_addr (mem_1, &base, &offset_1);
17110   extract_base_offset_in_addr (mem_2, &base, &offset_2);
17111   gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17112               && offset_2 != NULL_RTX);
17113
17114   if (INTVAL (offset_1) > INTVAL (offset_2))
17115     {
17116       std::swap (operands[0], operands[6]);
17117       std::swap (operands[1], operands[7]);
17118       std::swap (operands[2], operands[4]);
17119       std::swap (operands[3], operands[5]);
17120     }
17121
17122   if (load)
17123     {
17124       mem_1 = operands[1];
17125       mem_2 = operands[3];
17126       mem_3 = operands[5];
17127       mem_4 = operands[7];
17128     }
17129   else
17130     {
17131       mem_1 = operands[0];
17132       mem_2 = operands[2];
17133       mem_3 = operands[4];
17134       mem_4 = operands[6];
17135       gcc_assert (code == UNKNOWN);
17136     }
17137
17138   /* Extract the offset of the new first address.  */
17139   extract_base_offset_in_addr (mem_1, &base, &offset_1);
17140   extract_base_offset_in_addr (mem_2, &base, &offset_2);
17141
17142   /* Adjust offset thus it can fit in ldp/stp instruction.  */
17143   msize = GET_MODE_SIZE (mode);
17144   stp_off_limit = msize * 0x40;
17145   off_val = INTVAL (offset_1);
17146   abs_off = (off_val < 0) ? -off_val : off_val;
17147   new_off = abs_off % stp_off_limit;
17148   adj_off = abs_off - new_off;
17149
17150   /* Further adjust to make sure all offsets are OK.  */
17151   if ((new_off + msize * 2) >= stp_off_limit)
17152     {
17153       adj_off += stp_off_limit;
17154       new_off -= stp_off_limit;
17155     }
17156
17157   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
17158   if (adj_off >= 0x1000)
17159     return false;
17160
17161   if (off_val < 0)
17162     {
17163       adj_off = -adj_off;
17164       new_off = -new_off;
17165     }
17166
17167   /* Create new memory references.  */
17168   mem_1 = change_address (mem_1, VOIDmode,
17169                           plus_constant (DImode, operands[8], new_off));
17170
17171   /* Check if the adjusted address is OK for ldp/stp.  */
17172   if (!aarch64_mem_pair_operand (mem_1, mode))
17173     return false;
17174
17175   msize = GET_MODE_SIZE (mode);
17176   mem_2 = change_address (mem_2, VOIDmode,
17177                           plus_constant (DImode,
17178                                          operands[8],
17179                                          new_off + msize));
17180   mem_3 = change_address (mem_3, VOIDmode,
17181                           plus_constant (DImode,
17182                                          operands[8],
17183                                          new_off + msize * 2));
17184   mem_4 = change_address (mem_4, VOIDmode,
17185                           plus_constant (DImode,
17186                                          operands[8],
17187                                          new_off + msize * 3));
17188
17189   if (code == ZERO_EXTEND)
17190     {
17191       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17192       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17193       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17194       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17195     }
17196   else if (code == SIGN_EXTEND)
17197     {
17198       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17199       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17200       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17201       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17202     }
17203
17204   if (load)
17205     {
17206       operands[1] = mem_1;
17207       operands[3] = mem_2;
17208       operands[5] = mem_3;
17209       operands[7] = mem_4;
17210     }
17211   else
17212     {
17213       operands[0] = mem_1;
17214       operands[2] = mem_2;
17215       operands[4] = mem_3;
17216       operands[6] = mem_4;
17217     }
17218
17219   /* Emit adjusting instruction.  */
17220   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
17221   /* Emit ldp/stp instructions.  */
17222   t1 = gen_rtx_SET (operands[0], operands[1]);
17223   t2 = gen_rtx_SET (operands[2], operands[3]);
17224   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17225   t1 = gen_rtx_SET (operands[4], operands[5]);
17226   t2 = gen_rtx_SET (operands[6], operands[7]);
17227   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17228   return true;
17229 }
17230
17231 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
17232    it isn't worth branching around empty masked ops (including masked
17233    stores).  */
17234
17235 static bool
17236 aarch64_empty_mask_is_expensive (unsigned)
17237 {
17238   return false;
17239 }
17240
17241 /* Return 1 if pseudo register should be created and used to hold
17242    GOT address for PIC code.  */
17243
17244 bool
17245 aarch64_use_pseudo_pic_reg (void)
17246 {
17247   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17248 }
17249
17250 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
17251
17252 static int
17253 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17254 {
17255   switch (XINT (x, 1))
17256     {
17257     case UNSPEC_GOTSMALLPIC:
17258     case UNSPEC_GOTSMALLPIC28K:
17259     case UNSPEC_GOTTINYPIC:
17260       return 0;
17261     default:
17262       break;
17263     }
17264
17265   return default_unspec_may_trap_p (x, flags);
17266 }
17267
17268
17269 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17270    return the log2 of that value.  Otherwise return -1.  */
17271
17272 int
17273 aarch64_fpconst_pow_of_2 (rtx x)
17274 {
17275   const REAL_VALUE_TYPE *r;
17276
17277   if (!CONST_DOUBLE_P (x))
17278     return -1;
17279
17280   r = CONST_DOUBLE_REAL_VALUE (x);
17281
17282   if (REAL_VALUE_NEGATIVE (*r)
17283       || REAL_VALUE_ISNAN (*r)
17284       || REAL_VALUE_ISINF (*r)
17285       || !real_isinteger (r, DFmode))
17286     return -1;
17287
17288   return exact_log2 (real_to_integer (r));
17289 }
17290
17291 /* If X is a vector of equal CONST_DOUBLE values and that value is
17292    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
17293
17294 int
17295 aarch64_vec_fpconst_pow_of_2 (rtx x)
17296 {
17297   int nelts;
17298   if (GET_CODE (x) != CONST_VECTOR
17299       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17300     return -1;
17301
17302   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17303     return -1;
17304
17305   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17306   if (firstval <= 0)
17307     return -1;
17308
17309   for (int i = 1; i < nelts; i++)
17310     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17311       return -1;
17312
17313   return firstval;
17314 }
17315
17316 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17317    to float.
17318
17319    __fp16 always promotes through this hook.
17320    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17321    through the generic excess precision logic rather than here.  */
17322
17323 static tree
17324 aarch64_promoted_type (const_tree t)
17325 {
17326   if (SCALAR_FLOAT_TYPE_P (t)
17327       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17328     return float_type_node;
17329
17330   return NULL_TREE;
17331 }
17332
17333 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
17334
17335 static bool
17336 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17337                            optimization_type opt_type)
17338 {
17339   switch (op)
17340     {
17341     case rsqrt_optab:
17342       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17343
17344     default:
17345       return true;
17346     }
17347 }
17348
17349 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
17350
17351 static unsigned int
17352 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17353                                         int *offset)
17354 {
17355   /* Polynomial invariant 1 == (VG / 2) - 1.  */
17356   gcc_assert (i == 1);
17357   *factor = 2;
17358   *offset = 1;
17359   return AARCH64_DWARF_VG;
17360 }
17361
17362 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17363    if MODE is HFmode, and punt to the generic implementation otherwise.  */
17364
17365 static bool
17366 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17367 {
17368   return (mode == HFmode
17369           ? true
17370           : default_libgcc_floating_mode_supported_p (mode));
17371 }
17372
17373 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17374    if MODE is HFmode, and punt to the generic implementation otherwise.  */
17375
17376 static bool
17377 aarch64_scalar_mode_supported_p (scalar_mode mode)
17378 {
17379   return (mode == HFmode
17380           ? true
17381           : default_scalar_mode_supported_p (mode));
17382 }
17383
17384 /* Set the value of FLT_EVAL_METHOD.
17385    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17386
17387     0: evaluate all operations and constants, whose semantic type has at
17388        most the range and precision of type float, to the range and
17389        precision of float; evaluate all other operations and constants to
17390        the range and precision of the semantic type;
17391
17392     N, where _FloatN is a supported interchange floating type
17393        evaluate all operations and constants, whose semantic type has at
17394        most the range and precision of _FloatN type, to the range and
17395        precision of the _FloatN type; evaluate all other operations and
17396        constants to the range and precision of the semantic type;
17397
17398    If we have the ARMv8.2-A extensions then we support _Float16 in native
17399    precision, so we should set this to 16.  Otherwise, we support the type,
17400    but want to evaluate expressions in float precision, so set this to
17401    0.  */
17402
17403 static enum flt_eval_method
17404 aarch64_excess_precision (enum excess_precision_type type)
17405 {
17406   switch (type)
17407     {
17408       case EXCESS_PRECISION_TYPE_FAST:
17409       case EXCESS_PRECISION_TYPE_STANDARD:
17410         /* We can calculate either in 16-bit range and precision or
17411            32-bit range and precision.  Make that decision based on whether
17412            we have native support for the ARMv8.2-A 16-bit floating-point
17413            instructions or not.  */
17414         return (TARGET_FP_F16INST
17415                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17416                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17417       case EXCESS_PRECISION_TYPE_IMPLICIT:
17418         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17419       default:
17420         gcc_unreachable ();
17421     }
17422   return FLT_EVAL_METHOD_UNPREDICTABLE;
17423 }
17424
17425 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
17426    scheduled for speculative execution.  Reject the long-running division
17427    and square-root instructions.  */
17428
17429 static bool
17430 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17431 {
17432   switch (get_attr_type (insn))
17433     {
17434       case TYPE_SDIV:
17435       case TYPE_UDIV:
17436       case TYPE_FDIVS:
17437       case TYPE_FDIVD:
17438       case TYPE_FSQRTS:
17439       case TYPE_FSQRTD:
17440       case TYPE_NEON_FP_SQRT_S:
17441       case TYPE_NEON_FP_SQRT_D:
17442       case TYPE_NEON_FP_SQRT_S_Q:
17443       case TYPE_NEON_FP_SQRT_D_Q:
17444       case TYPE_NEON_FP_DIV_S:
17445       case TYPE_NEON_FP_DIV_D:
17446       case TYPE_NEON_FP_DIV_S_Q:
17447       case TYPE_NEON_FP_DIV_D_Q:
17448         return false;
17449       default:
17450         return true;
17451     }
17452 }
17453
17454 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
17455
17456 static int
17457 aarch64_compute_pressure_classes (reg_class *classes)
17458 {
17459   int i = 0;
17460   classes[i++] = GENERAL_REGS;
17461   classes[i++] = FP_REGS;
17462   /* PR_REGS isn't a useful pressure class because many predicate pseudo
17463      registers need to go in PR_LO_REGS at some point during their
17464      lifetime.  Splitting it into two halves has the effect of making
17465      all predicates count against PR_LO_REGS, so that we try whenever
17466      possible to restrict the number of live predicates to 8.  This
17467      greatly reduces the amount of spilling in certain loops.  */
17468   classes[i++] = PR_LO_REGS;
17469   classes[i++] = PR_HI_REGS;
17470   return i;
17471 }
17472
17473 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
17474
17475 static bool
17476 aarch64_can_change_mode_class (machine_mode from,
17477                                machine_mode to, reg_class_t)
17478 {
17479   if (BYTES_BIG_ENDIAN)
17480     {
17481       bool from_sve_p = aarch64_sve_data_mode_p (from);
17482       bool to_sve_p = aarch64_sve_data_mode_p (to);
17483
17484       /* Don't allow changes between SVE data modes and non-SVE modes.
17485          See the comment at the head of aarch64-sve.md for details.  */
17486       if (from_sve_p != to_sve_p)
17487         return false;
17488
17489       /* Don't allow changes in element size: lane 0 of the new vector
17490          would not then be lane 0 of the old vector.  See the comment
17491          above aarch64_maybe_expand_sve_subreg_move for a more detailed
17492          description.
17493
17494          In the worst case, this forces a register to be spilled in
17495          one mode and reloaded in the other, which handles the
17496          endianness correctly.  */
17497       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17498         return false;
17499     }
17500   return true;
17501 }
17502
17503 /* Implement TARGET_EARLY_REMAT_MODES.  */
17504
17505 static void
17506 aarch64_select_early_remat_modes (sbitmap modes)
17507 {
17508   /* SVE values are not normally live across a call, so it should be
17509      worth doing early rematerialization even in VL-specific mode.  */
17510   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17511     {
17512       machine_mode mode = (machine_mode) i;
17513       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17514       if (vec_flags & VEC_ANY_SVE)
17515         bitmap_set_bit (modes, i);
17516     }
17517 }
17518
17519 /* Target-specific selftests.  */
17520
17521 #if CHECKING_P
17522
17523 namespace selftest {
17524
17525 /* Selftest for the RTL loader.
17526    Verify that the RTL loader copes with a dump from
17527    print_rtx_function.  This is essentially just a test that class
17528    function_reader can handle a real dump, but it also verifies
17529    that lookup_reg_by_dump_name correctly handles hard regs.
17530    The presence of hard reg names in the dump means that the test is
17531    target-specific, hence it is in this file.  */
17532
17533 static void
17534 aarch64_test_loading_full_dump ()
17535 {
17536   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17537
17538   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17539
17540   rtx_insn *insn_1 = get_insn_by_uid (1);
17541   ASSERT_EQ (NOTE, GET_CODE (insn_1));
17542
17543   rtx_insn *insn_15 = get_insn_by_uid (15);
17544   ASSERT_EQ (INSN, GET_CODE (insn_15));
17545   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17546
17547   /* Verify crtl->return_rtx.  */
17548   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17549   ASSERT_EQ (0, REGNO (crtl->return_rtx));
17550   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17551 }
17552
17553 /* Run all target-specific selftests.  */
17554
17555 static void
17556 aarch64_run_selftests (void)
17557 {
17558   aarch64_test_loading_full_dump ();
17559 }
17560
17561 } // namespace selftest
17562
17563 #endif /* #if CHECKING_P */
17564
17565 #undef TARGET_ADDRESS_COST
17566 #define TARGET_ADDRESS_COST aarch64_address_cost
17567
17568 /* This hook will determines whether unnamed bitfields affect the alignment
17569    of the containing structure.  The hook returns true if the structure
17570    should inherit the alignment requirements of an unnamed bitfield's
17571    type.  */
17572 #undef TARGET_ALIGN_ANON_BITFIELD
17573 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17574
17575 #undef TARGET_ASM_ALIGNED_DI_OP
17576 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17577
17578 #undef TARGET_ASM_ALIGNED_HI_OP
17579 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17580
17581 #undef TARGET_ASM_ALIGNED_SI_OP
17582 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17583
17584 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17585 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17586   hook_bool_const_tree_hwi_hwi_const_tree_true
17587
17588 #undef TARGET_ASM_FILE_START
17589 #define TARGET_ASM_FILE_START aarch64_start_file
17590
17591 #undef TARGET_ASM_OUTPUT_MI_THUNK
17592 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17593
17594 #undef TARGET_ASM_SELECT_RTX_SECTION
17595 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17596
17597 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17598 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17599
17600 #undef TARGET_BUILD_BUILTIN_VA_LIST
17601 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17602
17603 #undef TARGET_CALLEE_COPIES
17604 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17605
17606 #undef TARGET_CAN_ELIMINATE
17607 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17608
17609 #undef TARGET_CAN_INLINE_P
17610 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17611
17612 #undef TARGET_CANNOT_FORCE_CONST_MEM
17613 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17614
17615 #undef TARGET_CASE_VALUES_THRESHOLD
17616 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17617
17618 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17619 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17620
17621 /* Only the least significant bit is used for initialization guard
17622    variables.  */
17623 #undef TARGET_CXX_GUARD_MASK_BIT
17624 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17625
17626 #undef TARGET_C_MODE_FOR_SUFFIX
17627 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17628
17629 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17630 #undef  TARGET_DEFAULT_TARGET_FLAGS
17631 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17632 #endif
17633
17634 #undef TARGET_CLASS_MAX_NREGS
17635 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17636
17637 #undef TARGET_BUILTIN_DECL
17638 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17639
17640 #undef TARGET_BUILTIN_RECIPROCAL
17641 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17642
17643 #undef TARGET_C_EXCESS_PRECISION
17644 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17645
17646 #undef  TARGET_EXPAND_BUILTIN
17647 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17648
17649 #undef TARGET_EXPAND_BUILTIN_VA_START
17650 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17651
17652 #undef TARGET_FOLD_BUILTIN
17653 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17654
17655 #undef TARGET_FUNCTION_ARG
17656 #define TARGET_FUNCTION_ARG aarch64_function_arg
17657
17658 #undef TARGET_FUNCTION_ARG_ADVANCE
17659 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17660
17661 #undef TARGET_FUNCTION_ARG_BOUNDARY
17662 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17663
17664 #undef TARGET_FUNCTION_ARG_PADDING
17665 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17666
17667 #undef TARGET_GET_RAW_RESULT_MODE
17668 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17669 #undef TARGET_GET_RAW_ARG_MODE
17670 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17671
17672 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17673 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17674
17675 #undef TARGET_FUNCTION_VALUE
17676 #define TARGET_FUNCTION_VALUE aarch64_function_value
17677
17678 #undef TARGET_FUNCTION_VALUE_REGNO_P
17679 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17680
17681 #undef TARGET_GIMPLE_FOLD_BUILTIN
17682 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17683
17684 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17685 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17686
17687 #undef  TARGET_INIT_BUILTINS
17688 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
17689
17690 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17691 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17692   aarch64_ira_change_pseudo_allocno_class
17693
17694 #undef TARGET_LEGITIMATE_ADDRESS_P
17695 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17696
17697 #undef TARGET_LEGITIMATE_CONSTANT_P
17698 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17699
17700 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17701 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17702   aarch64_legitimize_address_displacement
17703
17704 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17705 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17706
17707 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17708 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17709 aarch64_libgcc_floating_mode_supported_p
17710
17711 #undef TARGET_MANGLE_TYPE
17712 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17713
17714 #undef TARGET_MEMORY_MOVE_COST
17715 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17716
17717 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17718 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17719
17720 #undef TARGET_MUST_PASS_IN_STACK
17721 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17722
17723 /* This target hook should return true if accesses to volatile bitfields
17724    should use the narrowest mode possible.  It should return false if these
17725    accesses should use the bitfield container type.  */
17726 #undef TARGET_NARROW_VOLATILE_BITFIELD
17727 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17728
17729 #undef  TARGET_OPTION_OVERRIDE
17730 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17731
17732 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17733 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17734   aarch64_override_options_after_change
17735
17736 #undef TARGET_OPTION_SAVE
17737 #define TARGET_OPTION_SAVE aarch64_option_save
17738
17739 #undef TARGET_OPTION_RESTORE
17740 #define TARGET_OPTION_RESTORE aarch64_option_restore
17741
17742 #undef TARGET_OPTION_PRINT
17743 #define TARGET_OPTION_PRINT aarch64_option_print
17744
17745 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17746 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17747
17748 #undef TARGET_SET_CURRENT_FUNCTION
17749 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17750
17751 #undef TARGET_PASS_BY_REFERENCE
17752 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17753
17754 #undef TARGET_PREFERRED_RELOAD_CLASS
17755 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17756
17757 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17758 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17759
17760 #undef TARGET_PROMOTED_TYPE
17761 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17762
17763 #undef TARGET_SECONDARY_RELOAD
17764 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17765
17766 #undef TARGET_SHIFT_TRUNCATION_MASK
17767 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17768
17769 #undef TARGET_SETUP_INCOMING_VARARGS
17770 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17771
17772 #undef TARGET_STRUCT_VALUE_RTX
17773 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
17774
17775 #undef TARGET_REGISTER_MOVE_COST
17776 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17777
17778 #undef TARGET_RETURN_IN_MEMORY
17779 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17780
17781 #undef TARGET_RETURN_IN_MSB
17782 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17783
17784 #undef TARGET_RTX_COSTS
17785 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17786
17787 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17788 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17789
17790 #undef TARGET_SCHED_ISSUE_RATE
17791 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17792
17793 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17794 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17795   aarch64_sched_first_cycle_multipass_dfa_lookahead
17796
17797 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17798 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17799   aarch64_first_cycle_multipass_dfa_lookahead_guard
17800
17801 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17802 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17803   aarch64_get_separate_components
17804
17805 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17806 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17807   aarch64_components_for_bb
17808
17809 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17810 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17811   aarch64_disqualify_components
17812
17813 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17814 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17815   aarch64_emit_prologue_components
17816
17817 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17818 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17819   aarch64_emit_epilogue_components
17820
17821 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17822 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17823   aarch64_set_handled_components
17824
17825 #undef TARGET_TRAMPOLINE_INIT
17826 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17827
17828 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17829 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17830
17831 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17832 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17833
17834 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17835 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17836   aarch64_builtin_support_vector_misalignment
17837
17838 #undef TARGET_ARRAY_MODE
17839 #define TARGET_ARRAY_MODE aarch64_array_mode
17840
17841 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17842 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17843
17844 #undef TARGET_VECTORIZE_ADD_STMT_COST
17845 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17846
17847 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17848 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17849   aarch64_builtin_vectorization_cost
17850
17851 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17852 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17853
17854 #undef TARGET_VECTORIZE_BUILTINS
17855 #define TARGET_VECTORIZE_BUILTINS
17856
17857 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17858 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17859   aarch64_builtin_vectorized_function
17860
17861 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17862 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17863   aarch64_autovectorize_vector_sizes
17864
17865 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17866 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17867   aarch64_atomic_assign_expand_fenv
17868
17869 /* Section anchor support.  */
17870
17871 #undef TARGET_MIN_ANCHOR_OFFSET
17872 #define TARGET_MIN_ANCHOR_OFFSET -256
17873
17874 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17875    byte offset; we can do much more for larger data types, but have no way
17876    to determine the size of the access.  We assume accesses are aligned.  */
17877 #undef TARGET_MAX_ANCHOR_OFFSET
17878 #define TARGET_MAX_ANCHOR_OFFSET 4095
17879
17880 #undef TARGET_VECTOR_ALIGNMENT
17881 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17882
17883 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17884 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17885   aarch64_vectorize_preferred_vector_alignment
17886 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17887 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17888   aarch64_simd_vector_alignment_reachable
17889
17890 /* vec_perm support.  */
17891
17892 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17893 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17894   aarch64_vectorize_vec_perm_const
17895
17896 #undef TARGET_VECTORIZE_GET_MASK_MODE
17897 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17898 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17899 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17900   aarch64_empty_mask_is_expensive
17901
17902 #undef TARGET_INIT_LIBFUNCS
17903 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17904
17905 #undef TARGET_FIXED_CONDITION_CODE_REGS
17906 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17907
17908 #undef TARGET_FLAGS_REGNUM
17909 #define TARGET_FLAGS_REGNUM CC_REGNUM
17910
17911 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17912 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17913
17914 #undef TARGET_ASAN_SHADOW_OFFSET
17915 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17916
17917 #undef TARGET_LEGITIMIZE_ADDRESS
17918 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17919
17920 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17921 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17922
17923 #undef TARGET_CAN_USE_DOLOOP_P
17924 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17925
17926 #undef TARGET_SCHED_ADJUST_PRIORITY
17927 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17928
17929 #undef TARGET_SCHED_MACRO_FUSION_P
17930 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17931
17932 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17933 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17934
17935 #undef TARGET_SCHED_FUSION_PRIORITY
17936 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17937
17938 #undef TARGET_UNSPEC_MAY_TRAP_P
17939 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17940
17941 #undef TARGET_USE_PSEUDO_PIC_REG
17942 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17943
17944 #undef TARGET_PRINT_OPERAND
17945 #define TARGET_PRINT_OPERAND aarch64_print_operand
17946
17947 #undef TARGET_PRINT_OPERAND_ADDRESS
17948 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17949
17950 #undef TARGET_OPTAB_SUPPORTED_P
17951 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17952
17953 #undef TARGET_OMIT_STRUCT_RETURN_REG
17954 #define TARGET_OMIT_STRUCT_RETURN_REG true
17955
17956 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17957 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17958   aarch64_dwarf_poly_indeterminate_value
17959
17960 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
17961 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17962 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
17963
17964 #undef TARGET_HARD_REGNO_NREGS
17965 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17966 #undef TARGET_HARD_REGNO_MODE_OK
17967 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17968
17969 #undef TARGET_MODES_TIEABLE_P
17970 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17971
17972 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17973 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17974   aarch64_hard_regno_call_part_clobbered
17975
17976 #undef TARGET_CONSTANT_ALIGNMENT
17977 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17978
17979 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17980 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17981
17982 #undef TARGET_CAN_CHANGE_MODE_CLASS
17983 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17984
17985 #undef TARGET_SELECT_EARLY_REMAT_MODES
17986 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17987
17988 #if CHECKING_P
17989 #undef TARGET_RUN_TARGET_SELFTESTS
17990 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17991 #endif /* #if CHECKING_P */
17992
17993 struct gcc_target targetm = TARGET_INITIALIZER;
17994
17995 #include "gt-aarch64.h"