This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [PATCH 2/2, AArch64] Pipeline model for APM XGene-1.

From: Kyrill Tkachov <kyrylo dot tkachov at arm dot com>
To: Philipp Tomsich <philipp dot tomsich at theobroma-systems dot com>, "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
Cc: "marcus dot shawcroft at gmail dot com" <marcus dot shawcroft at gmail dot com>, "benedikt dot huber at theobroma-systems dot com" <benedikt dot huber at theobroma-systems dot com>, "ksankaran at apm dot com" <ksankaran at apm dot com>
Date: Wed, 19 Nov 2014 18:02:18 +0000
Subject: Re: [PATCH 2/2, AArch64] Pipeline model for APM XGene-1.
Authentication-results: sourceware.org; auth=none
References: <1416418333-47433-1-git-send-email-philipp dot tomsich at theobroma-systems dot com> <1416418333-47433-3-git-send-email-philipp dot tomsich at theobroma-systems dot com>

Hi Philipp,

Some comments inline

On 19/11/14 17:32, Philipp Tomsich wrote:

---
  gcc/ChangeLog                 |   6 +
  gcc/config/aarch64/aarch64.md |   4 +-
  gcc/config/arm/xgene1.md      | 739 ++++++++++++++++++++++++++++++++++++++++++
  3 files changed, 748 insertions(+), 1 deletion(-)
  create mode 100644 gcc/config/arm/xgene1.md

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 5b389c5..9cc3b5a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,11 @@
  2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>

+       * config/aarch64/aarch64.md: Include xgene1.md.
+       (generic_sched): Set to no for xgene1.
+       * config/arm/xgene1.md: New file.
+
+2014-11-19  Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
         * config/aarch64/aarch64-cores.def (xgene1): Update/add the
         xgene1 (APM XGene-1) core definition.
         * gcc/config/aarch64/aarch64.c: Add cost tables for APM XGene-1
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 597ff8c..5d92051 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -191,7 +191,7 @@

  (define_attr "generic_sched" "yes,no"
    (const (if_then_else
-          (eq_attr "tune" "cortexa53,cortexa15,thunderx")
+          (eq_attr "tune" "cortexa53,cortexa15,thunderx,xgene1")
            (const_string "no")
            (const_string "yes"))))

@@ -4211,3 +4211,5 @@

  ;; Atomic Operations
  (include "atomics.md")
+
+(include "../arm/xgene1.md")
diff --git a/gcc/config/arm/xgene1.md b/gcc/config/arm/xgene1.md
new file mode 100644
index 0000000..3c08b16
--- /dev/null
+++ b/gcc/config/arm/xgene1.md
@@ -0,0 +1,739 @@
+;; Machine description for AppliedMicro xgene1 core.
+;; Copyright (C) 2012 Free Software Foundation, Inc.


Copyright 2012-2014 I think...

+;; Contributed by Theobroma Systems Design und Consulting GmbH.
+;;                See http://www.theobroma-systems.com for more info.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; Pipeline description for the xgene1 micro-architecture
+
+(define_automaton "xgene1")
+
+(define_cpu_unit "decode_out_0" "xgene1")
+(define_cpu_unit "decode_out_1" "xgene1")
+(define_cpu_unit "decode_out_2" "xgene1")
+(define_cpu_unit "decode_out_3" "xgene1")
+
+(define_cpu_unit "divide" "xgene1")
+(define_cpu_unit "fp_divide" "xgene1")
+
+(define_reservation "decode1op"
+        "( decode_out_0 )
+        |( decode_out_1 )
+        |( decode_out_2 )
+        |( decode_out_3 )"
+)
+(define_reservation "decode2op"
+        "( decode_out_0 + decode_out_1 )
+        |( decode_out_0 + decode_out_2 )
+        |( decode_out_0 + decode_out_3 )
+        |( decode_out_1 + decode_out_2 )
+        |( decode_out_1 + decode_out_3 )
+        |( decode_out_2 + decode_out_3 )"
+)
+(define_reservation "decodeIsolated"
+        "( decode_out_0 + decode_out_1 + decode_out_2 + decode_out_3 )"
+)
+
+;; (define_insn_reservation "dummy" 1
+;;   (and (eq_attr "tune" "xgene1")
+;;        (eq_attr "type" "neon_minmax"))
+;;   "decodeIsolated")


Remove that commented out unit.

+
+;; B: nop.
+;; BR: branch op.
+
+;; RET
+;; CBZ
+;; TBZ
+(define_insn_reservation "branch" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "branch"))
+  "decode1op")
+
+;; NOP
+;; HINT
+(define_insn_reservation "nop" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "no_insn"))
+  "decode1op")
+
+;; See #3565

This is not meaningful; it should be removed. Similarly elsewhere in the file.

+;; BLR: arithmetic op & branch op.
+;; BL: arithmetic op.
+(define_insn_reservation "call" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "call"))
+  "decode2op")
+
+;; LDR: FP load op & arithmetic op.
+(define_insn_reservation "f_load" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_loadd,f_loads"))
+  "decode2op,nothing*9")

You can just write "decode2op". The nothing*9 is implicit if you don't specify 'nothing'. Using 'nothing' is only useful if you want to model the reservation of a unit with intermediate 'empty stages' like:

"unit0, nothing*2, unit1". Similarly throughout the file.

+
+;; STR: FP store op & arithmetic op.
+(define_insn_reservation "f_store" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_stored,f_stores"))
+  "decode2op,nothing*3")
+
+;; FMOV (immediate): FP move op.
+;; FMOV (register): FP move op.
+(define_insn_reservation "fmov" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fmov,fconsts,fconstd"))
+  "decode1op,nothing")
+
+;; LDP: FP load op & FP load op.
+;; LDP: FP load op & FP load op & arithmetic op.
+(define_insn_reservation "f_mcr" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_mcr"))
+  "decodeIsolated,nothing*9")
+
+;; STP: FP store op & FP store op.
+(define_insn_reservation "f_mrc" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_mrc"))
+  "decode2op,nothing*3")
+
+;; The (register offset) instructions with a shift
+;; of #0, #2, or #3 (or no shift) are translated
+;; as shown.
+;; For these instructions, any other shift amount
+;; causes the instruction be prefixed with an
+;; sbfm/ubfm op (1 cycle latency).
+
+;; Load/store register pair (post-indexed):
+;; LDP: load op & load op & arithmetic op.
+;; Load/store register pair (offset):
+;; LDP: load op & load op.
+;; Load/store register pair (pre-indexed):
+;; LDP: load op & load op & arithmetic op.
+;; 5 + 1
+(define_insn_reservation "load_pair" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "load2"))
+  "decodeIsolated,nothing*5")
+
+;; Load/store register pair (post-indexed):
+;; STP: store op & store op & arithmetic op.
+;; Load/store register pair (offset):
+;; STP: store op & store op.
+;; Load/store register pair (pre-indexed):
+;; STP: store op & store op & arithmetic op.
+;; 1 + 1
+(define_insn_reservation "store_pair" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "store2"))
+  "decodeIsolated,nothing")
+
+;; Load register (literal):
+;; LDR: load op.
+;; Load/store register (immediate post-indexed):
+;; LDRB/LDRH/LDR: load op & arithmetic op.
+;; Load/store register (immediate pre-indexed):
+;; LDRB/LDRH/LDR: load op & arithmetic op.
+;; Load/store register (register offset)
+;; DRB/LDRH/LDR: load op.
+;; LDRSB/LDRSH/LDRSW: load op + sbfm op (1 cycle latency).
+;; Load/store register (unsigned immediate):
+;; LDRB/LDRH/LDR: load op.
+;; 5 + 1
+;; FIXME This is inaccurate but avoids a crash.
+(define_insn_reservation "load1" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "load1"))
+  "decode2op,nothing")
+
+;; Load/store register (immediate post-indexed):
+;; STRB/STRH/STR: store op & arithmetic op.
+;; Load/store register (immediate pre-indexed):
+;; STRB/STRH/STR: store op & arithmetic op.
+;; Load/store register (register offset)
+;; STRB/STRH/STR: store op.
+;; Load/store register (unsigned immediate):
+;; STRB/STRH/STR: store op.
+;; 1 + 1
+(define_insn_reservation "store1" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "store1"))
+  "decode2op,nothing")
+
+;; MOVI
+;; MOV
+;; Move wide: logical op.
+;; MRS NZCV: logical op (register result).
+(define_insn_reservation "move" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "mov_reg,mov_imm,mrs"))
+  "decode1op")
+
+;; See #3565
+(define_insn_reservation "alu" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "alu_imm,alu_sreg,alu_shift_imm,\
+                        alu_ext,adc_reg,csel,logic_imm,\
+                        logic_reg,logic_shift_imm,clz,\
+                        rbit,shift_reg,adr,mov_reg,\
+                        mov_imm,extend"))
+  "decode1op")
+
+;; REV/REV16/REV32: SIMD op.
+(define_insn_reservation "simd" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "rev"))
+  "decode1op")
+
+;; See #3565
+(define_insn_reservation "alus" 1
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "alus_imm,alu_sreg,alus_shift_imm,\
+                        alus_ext,logics_imm,logics_reg,\
+                        logics_shift_imm"))
+  "decode1op")
+
+;; MADD/SMADDL/UMADDL with Ra=XZR/WZR: multiply op.
+;; MADD/SMADDL/UMADDL with other Ra: multiply op + arithmetic op.
+;; MSUB/SMSUBL/UMSUBL: multiply op + arithmetic op.
+;; SMULH/UMULH: multiply op.
+;; 5 + 1
+(define_insn_reservation "mul" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "mul,mla,smull,umull,smlal,umlal"))
+  "decode2op,nothing*5")
+
+;; UDIV/SDIV: divide op.
+(define_insn_reservation "div" 66
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "sdiv,udiv"))
+  "decode1op,divide*65")

Such large reservations tend to blow up the state-space of the automaton without contributing too much to the codegen quality.

See PR 60743 for an example where it bit us.
Is the automaton size reasonable here?
You can get a feel of how large the automaton becomes by adding the options:
 (automata_option "v")
 (automata_option "time")
 (automata_option "stats")
 (automata_option "progress")

to the .md file and it will show you some stats during genautomata.

+
+;; FCMP/FCMPE: FP compare op.
+(define_insn_reservation "fcmp" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fcmpd,fcmps"))
+  "decode1op,nothing*11")
+
+;; FCSEL: FP select op
+(define_insn_reservation "fcsel" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fcsel"))
+  "decode1op,nothing*2")
+
+;; See #3565
+(define_insn_reservation "bfm" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "bfm"))
+  "decode1op,nothing")
+
+;; FRINTN/FRINTP/FRINTM/FRINTZ/FRINTA/FRINTX/FRINTI:
+;; FP convert op
+(define_insn_reservation "f_rint" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_rintd,f_rints"))
+  "decode1op,nothing*4")
+
+;; FCVT (single to double or double to single): FP arithmetic op.
+;; FCVT (to or from half precision): FP half cvt op.
+(define_insn_reservation "f_cvt" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_cvt"))
+  "decode1op,nothing*2")
+
+;; Floating-point<->integer conversions:
+;; FCVTNS/FCVTAS/FCVTPS/FCVTMS: FP convert op + FP store op (data bypass path) + (integer) load op.
+;; FCVTNU/FCVTAU/FCVTPU/FCVTMU: FP convert op + FP store op (data bypass path) + (integer) load op.
+;; FCVTZS/FCVTZU (integer): FP convert op + FP store op (data bypass path) + (integer) load op.
+;; Floating-point<->fixed-point conversions:
+;; FCVTZS/FCVTZU (fixed-point): FP convert op + FP store op (data bypass path) + (integer) load op.
+;; 5 + 1 + 5
+(define_insn_reservation "f_cvtf2i" 11
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_cvtf2i"))
+  "decodeIsolated,nothing*10")
+
+;; SCVTF/UCVTF (integer): (integer) store op (data bypass path) + FP load op + FP arithmetic op.
+;; SCVTF/UCVTF (fixed-point): (integer) store op (data bypass path) + FP load op + FP arithmetic op.
+;; -1 + 10 + 5
+(define_insn_reservation "f_cvti2f" 14
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_cvti2f"))
+  "decodeIsolated,nothing*13")
+
+;; FMUL/FADD/FSUB/FNMUL: FP arithmetic op.
+(define_insn_reservation "f_add" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "faddd,fadds,fmuld,fmuls"))
+  "decode1op,nothing*5")
+
+;; FDIV: FP divide op.
+(define_insn_reservation "f_div" 28
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fdivd,fdivs"))
+  "decode1op,fp_divide*27")
+
+;; FABS/FNEG: FP move op.
+(define_insn_reservation "f_arith" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "ffarithd,ffariths"))
+  "decode1op,nothing")
+
+;; FSQRT: FP sqrt op.
+(define_insn_reservation "f_sqrt" 38
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "fsqrtd,fsqrts"))
+  "decode1op,fp_divide*37")
+
+;; FMAX/FMIN/FMAXNM/FMINNM: FP select op.
+(define_insn_reservation "f_select" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "f_minmaxd,f_minmaxs"))
+  "decode1op,nothing*2")
+
+
+;; SIMD (aka neon)
+
+;; DUP (element) (size=x1000): ASIMD logical op.
+;; DUP (element) (size=other): ASIMD shift op.
+(define_insn_reservation "neon_dup" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_dup,neon_dup_q"))
+  "decode1op,nothing*2")
+
+;; LDR: FP load op & arithmetic op.
+;; LD1 (one register, 1D): FP load op.
+;; LD1 (one register, 2D): FP load op*2.
+;; LD1 (one register, 2S/4H/8B): FP complex load op.
+;; LD1 (one register, 4S/8H/16B): FP complex load op*2.
+(define_insn_reservation "neon_load1" 11
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q"))
+  "decode2op,nothing*10")
+
+;; STR: FP store op & arithmetic op.
+;; ST1 (one register, 1D): FP store op
+;; ST1 (one register, 2D): FP store op*2
+;; ST1 (one register, 2S/4H/8B): FP complex store op
+;; ST1 (one register, 4S/8H/16B): FP complex store op*2
+;; 4 + 1
+(define_insn_reservation "neon_store1" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q"))
+  "decode2op,nothing*4")
+
+;; MOVI/MVNI/ORR/BIC/FMOV: ASIMD logical op^Q.
+;; AND/BIC/ORR/ORN/EOR/BSL/BIT/BIF: ASIMD logical op^Q.
+(define_insn_reservation "neon_logic" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_logic,\
+                        neon_logic_q,\
+                        neon_bsl,\
+                        neon_bsl_q,\
+                        neon_move,\
+                        neon_move_q,\
+                       "))
+  "decode1op,nothing")
+;; N.B. ^Q means that it only uses one decode slot.
+
+;; UMOV (imm5=xxx00): FP store op (data bypass path) + (integer) load op.
+;; UMOV (imm5=other): FP store op (data bypass path) + (integer) load op + ubfm op (1 cycle latency).
+;; 1 + 5 + 1
+(define_insn_reservation "neon_umov" 7
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_to_gp, neon_to_gp_q"))
+  "decodeIsolated,nothing*6")
+
+;; INS (element) (imm5=01000): FP move op.
+;; INS (element) (imm5=other): ASIMD shift op + ASIMD insert op.
+;; INS (general register) (imm5=01000): (integer) store op + FP load op.
+;; INS (general register) (imm5=other): (integer) store op + FP load op + ASIMD insert op.
+;; 1 + 10 + 3
+(define_insn_reservation "neon_ins" 14
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_from_gp,\
+                        neon_from_gp_q,\
+                        neon_ins,\
+                        neon_ins_q,\
+                       "))
+  "decodeIsolated,nothing*13")
+
+;; USHR/URSHR: ASIMD shift op.
+;; USHR/URSHR: ASIMD shift op^Q.

What are these ^Q, editor artifact?

+;; SSHR/SRSHR: ASIMD shift op.
+;; SSHR/SRSHR: ASIMD shift op^Q.
+;; SHL/SQSHL/SQSHLU: ASIMD shift op.
+;; SHL/SQSHL/SQSHLU: ASIMD shift op^Q.
+;; SSHL/SQSHL/SRSHL/SQRSHL: ASIMD shift op.
+;; SSHL/SQSHL/SRSHL/SQRSHL: ASIMD shift op^Q.
+;; USHL/UQSHL/URSHL/UQRSHL: ASIMD shift op.
+;; USHL/UQSHL/URSHL/UQRSHL: ASIMD shift op^Q.
+;; XTN/SQXTN/UQXTN/SQXTUN/SHLL: ASIMD shift op.
+;; SSHLL/USHLL: ASIMD shift op*2.
+(define_insn_reservation "neon_shift" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_shift_imm,\
+                        neon_shift_imm_q,\
+                        neon_shift_reg,\
+                        neon_shift_reg_q,\
+                        neon_shift_imm_long,\
+                        neon_sat_shift_imm,\
+                        neon_sat_shift_imm_q,\
+                        neon_sat_shift_imm_narrow_q,\
+                        neon_sat_shift_reg,\
+                        neon_sat_shift_reg_q,\
+                        neon_shift_imm_narrow_q,\
+                       "))
+  "decode1op,nothing*2")
+
+;; ADD/SUB: ASIMD arithmetic op.
+;; ADD/SUB/ADDP: ASIMD arithmetic op^Q.
+;; SMAX/SMIN/SABD/SMAXP/SMINP: ASIMD arithmetic op^Q.
+;; UMAX/UMIN/UABD/UMAXP/UMINP: ASIMD arithmetic op^Q.
+;; USQADD/ABS/NEG: ASIMD arithmetic op.
+;; UADDLP/USQADD/ABS/NEG: ASIMD arithmetic op^Q.
+;; SHADD/SQADD/SRHADD/SHSUB/SQSUB: ASIMD arithmetic op^Q.
+;; UHADD/UQADD/URHADD/UHSUB/UQSUB: ASIMD arithmetic op^Q.
+
+;; SHADD/SQADD/SRHADD/SHSUB/SQSUB: ASIMD arithmetic op^Q.
+;; UHADD/UQADD/URHADD/UHSUB/UQSUB: ASIMD arithmetic op^Q.
+;; SADDLP/SUQADD/SQABS/SQNEG: ASIMD arithmetic op^Q.
+;; UADDLP/USQADD/ABS/NEG: ASIMD arithmetic op^Q.
+
+;; CMGT/CMGE/CMTST/CMHI/CMHS/CMEQ (register): ASIMD arithmetic op.
+;; CMGT/CMEQ/CMLT/CMGE/CMLE (zero): ASIMD arithmetic op.
+;; CMGT/CMGE/CMTST/CMHI/CMHS/CMEQ (register): ASIMD arithmetic op^Q.
+;; CMGT/CMEQ/CMLT/CMGE/CMLE (zero): ASIMD arithmetic op^Q.
+(define_insn_reservation "neon_arith" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_add,\
+                        neon_add_q,\
+                        neon_sub,\
+                        neon_sub_q,\
+                        neon_neg,\
+                        neon_neg_q,\
+                        neon_abs,\
+                        neon_abs_q,\
+                        neon_abd_q,\
+                        neon_arith_acc,\
+                        neon_arith_acc_q,\
+                        neon_reduc_add,\
+                        neon_reduc_add_q,\
+                        neon_add_halve,\
+                        neon_add_halve_q,\
+                        neon_sub_halve,\
+                        neon_sub_halve_q,\
+                        neon_qadd,\
+                        neon_qadd_q,\
+                        neon_compare,\
+                        neon_compare_q,\
+                        neon_compare_zero,\
+                        neon_compare_zero_q,\
+                        neon_tst,\
+                        neon_tst_q,\
+                       "))
+  "decode1op,nothing*2")
+
+;; SABA/UABA: (ASIMD arithmetic op + ASIMD arithmetic op)^Q.
+;; 3*3
+(define_insn_reservation "neon_abs_diff" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_arith_acc,neon_arith_acc_q"))
+  "decode2op,nothing*5")
+
+;; MUL/MLA/MLS/SQDMULH/SQRDMULH: ASIMD multiply op^Q.
+;; MUL/SQDMULH/SQRDMULH/MLA/MLS (by element): ASIMD multiply op^Q.
+;; SMLAL/SMLSL/SMULL/SQDMLAL/SQDMLSL/SQDMULL: ASIMD multiply op*2.
+;; SMULL/SMLAL/SMLSL (by element): ASIMD multiply op*2.
+;; UMULL/UMLAL/UMLSL (by element): ASIMD multiply op*2.
+;; FMUL/FMULX/FMLA/FMLS (by element): ASIMD multiply op.
+;; FMUL/FMULX/FMLA/FMLS (by element): ASIMD multiply op^Q.
+
+;; SQDMULH/SQRDMULH: ASIMD multiply op.
+;; SQDMULH/SQRDMULH (by element): ASIMD multiply op.
+;; MUL/MLA/MLS/SQDMULH/SQRDMULH: ASIMD multiply op^Q.
+;; MUL/SQDMULH/SQRDMULH/MLA/MLS (by element): ASIMD multiply op^Q.
+
+;; SQDMULL/SQDMLAL/SQDMLSL (by element): ASIMD multiply op*2.
+;; SQDMULL/SQDMLAL/SQDMLSL: ASIMD multiply op.
+(define_insn_reservation "neon_mul" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_mul_b,\
+                        neon_mul_b_q,\
+                        neon_mul_h,\
+                        neon_mul_h_q,\
+                        neon_mul_s,\
+                        neon_mul_s_q,\
+                        neon_fp_mul_s_scalar,\
+                        neon_fp_mul_s_scalar_q,\
+                        neon_fp_mul_d_scalar_q,\
+                        neon_mla_b,neon_mla_b_q,\
+                        neon_mla_h,neon_mla_h_q,\
+                        neon_mla_s,neon_mla_s_q,\
+                        neon_mla_h_scalar,\
+                        neon_mla_h_scalar_q,\
+                        neon_mla_s_scalar,\
+                        neon_mla_s_scalar_q,\
+                        neon_mla_b_long,\
+                        neon_mla_h_long,\
+                        neon_mla_s_long,\
+                        neon_fp_mul_s,\
+                        neon_fp_mul_s_q,\
+                        neon_fp_mul_d,\
+                        neon_fp_mul_d_q,\
+                        neon_fp_mla_s,\
+                        neon_fp_mla_s_q,\
+                        neon_fp_mla_d,\
+                        neon_fp_mla_d_q,\
+                        neon_fp_mla_s_scalar,\
+                        neon_fp_mla_s_scalar_q,\
+                        neon_fp_mla_d_scalar_q,\
+                        neon_sat_mul_b,\
+                        neon_sat_mul_b_q,\
+                        neon_sat_mul_h,\
+                        neon_sat_mul_h_q,\
+                        neon_sat_mul_s,\
+                        neon_sat_mul_s_q,\
+                        neon_sat_mul_h_scalar,\
+                        neon_sat_mul_h_scalar_q,\
+                        neon_sat_mul_s_scalar,\
+                        neon_sat_mul_s_scalar_q,\
+                        neon_sat_mul_h_scalar_long,\
+                        neon_sat_mul_s_scalar_long,\
+                        neon_sat_mla_b_long,\
+                        neon_sat_mla_h_long,\
+                        neon_sat_mla_s_long,\
+                        neon_sat_mla_h_scalar_long,\
+                        neon_sat_mla_s_scalar_long,\
+                       "))
+  "decode2op,nothing*4")
+
+;; FMULX/FRECPS/FRSQRTS/FABD: FP arithmetic op.
+;; FABD: FP arithmetic op^Q.
+(define_insn_reservation "fp_abd_diff" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_abd_s,\
+                        neon_fp_abd_s_q,\
+                        neon_fp_abd_d,\
+                        neon_fp_abd_d_q,\
+                       "))
+  "decode1op,nothing*4")
+
+;; See #3565
+;; FMUL/FADD/FSUB/FNMUL: FP arithmetic op.
+(define_insn_reservation "neon_f_add" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_addsub_s,\
+                        neon_fp_addsub_s_q,\
+                        neon_fp_addsub_d,\
+                        neon_fp_addsub_d_q,\
+                       "))
+  "decode1op,nothing*5")
+
+;; FDIV: FP divide op^Q.
+(define_insn_reservation "neon_f_div" 28
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_div_s,\
+                        neon_fp_div_s_q,\
+                        neon_fp_div_d,\
+                        neon_fp_div_d_q,\
+                       "))
+  "decode1op,fp_divide*27")
+
+;; FABS/FNEG: FP move op^Q.
+(define_insn_reservation "neon_f_neg" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_neg_s,\
+                        neon_fp_neg_s_q,\
+                        neon_fp_neg_d,\
+                        neon_fp_neg_d_q,\
+                        neon_fp_abs_s,\
+                        neon_fp_abs_s_q,\
+                        neon_fp_abs_d,\
+                        neon_fp_abs_d_q,\
+                       "))
+  "decode1op,nothing")
+
+;; FRINTN/FRINTM/FRINTA/FRINTP/FRINTZ/FRINTX/FRINTI: FP convert op^Q.
+(define_insn_reservation "neon_f_round" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_round_s,\
+                        neon_fp_round_s_q,\
+                        neon_fp_round_d,\
+                        neon_fp_round_d_q,\
+                       "))
+  "decode1op,nothing*4")
+
+;; FCVTNS/FCVTMS/FCVTAS/FCVTPS: FP convert op^Q.
+;; FCVTNU/FCVTMU/FCVTAU/FCVTPU: FP convert op^Q.
+;; FCVTZS/FCVTZU (integer): FP convert op^Q.
+;; FCVTN/FCVTL (size=0): FP half cvt op.
+(define_insn_reservation "neon_f_cvt" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type"  "neon_int_to_fp_s,\
+                         neon_int_to_fp_s_q,\
+                         neon_int_to_fp_d,\
+                         neon_int_to_fp_d_q,\
+                         neon_fp_cvt_widen_s,\
+                         neon_fp_cvt_narrow_s_q,\
+                         neon_fp_cvt_narrow_d_q,\
+                        "))
+  "decode1op,nothing*4")
+
+;; FADD/FSUB/FMULX/FMLA/FMLS/FADDP: FP arithmetic op^Q.
+(define_insn_reservation "neon_f_reduc" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_reduc_add_s,\
+                        neon_fp_reduc_add_s_q,\
+                        neon_fp_reduc_add_d,\
+                        neon_fp_reduc_add_d_q,\
+                       "))
+  "decode1op,nothing*4")
+
+;; CLS/CLZ/CNT/NOT/RBIT: ASIMD logical op^Q.
+;; PMUL: ASIMD logical op^Q.
+(define_insn_reservation "neon_cls" 2
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_cls,neon_cls_q"))
+  "decode1op,nothing")
+
+;; ST1 (one register, 1D): FP store op.
+(define_insn_reservation "neon_st1" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_store1_one_lane,\
+                        neon_store1_one_lane_q,\
+                       "))
+  "decode1op,nothing*3")
+
+;; ADDHN/SUBHN/RADDHN/RSUBHN: ASIMD arithmetic op*2 + ASIMD shift op.
+;; 3 + 3
+(define_insn_reservation "neon_halve_narrow" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_sub_halve_narrow_q,\
+                        neon_add_halve_narrow_q,\
+                       "))
+  "decodeIsolated,nothing*5")
+
+;; SSRA/SRSRA: (ASIMD shift op + ASIMD arithmetic op).
+;; USRA/URSRA: (ASIMD shift op + ASIMD arithmetic op).
+;; SSRA/SRSRA: (ASIMD shift op + ASIMD arithmetic op)^Q.
+;; USRA/URSRA: (ASIMD shift op + ASIMD arithmetic op)^Q.
+;; 3 + 3
+(define_insn_reservation "neon_shift_acc" 6
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_shift_acc,\
+                        neon_shift_acc_q,\
+                       "))
+  "decode2op,nothing*5")
+
+;; FCMEQ/FCMGE/FCMGT/FACGE/FACGT: FP select op.
+;; FCMGT/FCMEQ/FCMLT/FCMGE/FCMLE (zero): FP select op.
+;; FCMEQ/FCMGE/FCMGT/FACGE/FACGT: FP select op^Q.
+;; FCMGT/FCMEQ/FCMLT/FCMGE/FCMLE (zero): FP select op^Q.
+(define_insn_reservation "neon_fp_compare" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_compare_s,\
+                        neon_fp_compare_s_q,\
+                        neon_fp_compare_d,\
+                        neon_fp_compare_d_q,\
+                       "))
+  "decode1op,nothing*2")
+
+;; FSQRT: FP sqrt op.
+(define_insn_reservation "neon_fp_sqrt" 38
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_sqrt_s,\
+                        neon_fp_sqrt_s_q,\
+                        neon_fp_sqrt_d,\
+                        neon_fp_sqrt_d_q,\
+                       "))
+  "decode1op,fp_divide*37")


Similar concern to the integer divide comment above.

+
+;; See #3566
+;; TBL/TBX (single register table): (ASIMD logical op + ASIMD logical op)^Q.
+;; 2 + 2
+(define_insn_reservation "neon_tbl1" 4
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_tbl1,\
+                        neon_tbl1_q,\
+                       "))
+  "decode2op,nothing*2")
+
+;; TBL/TBX (two register table): (ASIMD logical op + ASIMD logical op + ASIMD logical op + ASIMD logical op)^Q.
+;; 2 + 2 + 2 + 2
+(define_insn_reservation "neon_tbl2" 8
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_tbl2,\
+                        neon_tbl2_q,\
+                       "))
+  "decodeIsolated,nothing*7")
+
+;; See #3565
+;; ZIP1/ZIP2/UZP1/UZP2 (Q=0): ASIMD shift op.
+;; ZIP1/ZIP2/UZP1/UZP2 (Q=1, size=11): ASIMD logical op*2.
+;; ZIP1/ZIP2/UZP1/UZP2 (Q=1, size=other): ASIMD shift op*2.
+;; TRN1/TRN2 (size=11): ASIMD logical op*2.
+;; TRN1/TRN2 (size=other): ASIMD shift op^Q.
+(define_insn_reservation "neon_permute" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_permute,\
+                        neon_permute_q,\
+                       "))
+  "decode2op,nothing*2")
+
+;; LD1R: FP load op.
+(define_insn_reservation "neon_ld1r" 10
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_load1_all_lanes,\
+                       "))
+  "decode1op,nothing*9")
+
+;; FRECPE/FRECPX: ASIMD dre op.
+;; FRECPE/FRECPX: ASIMD dre op.
+(define_insn_reservation "neon_fp_recp" 3
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_recpe_s,\
+                        neon_fp_recpe_s_q,\
+                        neon_fp_recpe_d,\
+                        neon_fp_recpe_d_q,\
+                        neon_fp_recpx_s,\
+                        neon_fp_recpx_s_q,\
+                        neon_fp_recpx_d,\
+                        neon_fp_recpx_d_q,\
+                       "))
+  "decode1op,nothing*2")
+
+
+;; FMULX/FRECPS/FRSQRTS/FABD: FP arithmetic op.
+;; FRECPS/FRSQRTS: FP arithmetic op^Q.
+(define_insn_reservation "neon_fp_recp_s" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_fp_recps_s,\
+                        neon_fp_recps_s_q,\
+                        neon_fp_recps_d,\
+                        neon_fp_recps_d_q,\
+                       "))
+  "decode1op,nothing*4")
+
+;; See #3566
+;; PMULL: ASIMD polymul op*2.
+(define_insn_reservation "neon_pmull" 5
+  (and (eq_attr "tune" "xgene1")
+       (eq_attr "type" "neon_mul_d_long,\
+                       "))
+  "decode2op,nothing*4")
--
1.9.1

References:
- [PATCH 0/2, AArch64] APM X-Gene 1 cost-table and pipeline model
  - From: Philipp Tomsich
- [PATCH 2/2, AArch64] Pipeline model for APM XGene-1.
  - From: Philipp Tomsich

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]