This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
DFA updates for the PA
- From: law at redhat dot com
- To: gcc-patches at gcc dot gnu dot org
- Date: Mon, 06 May 2002 10:56:05 -0600
- Subject: DFA updates for the PA
- Reply-to: law at redhat dot com
Various minor updates to the scheduling description for the PA. The primary
change is using the bypass capabilities in the md file instead of adjust_cost.
Installed into the mainline sources.
* pa-protos.h (hppa_fpstore_bypass_p): Declare.
* pa.c (pa_adjust_cost): Remove all true dependency cost
adjustments. Also remove support for non-DFA scheduling.
* pa.md (700, 7100, 7100lc, 7200, 7300): Use bypass mechanism
to adjust true dependency costs. Update various comments.
(7100lc, 7200, 7300 scheduling): Simplify by combining the
FP ALU & MPY units into a single unit.
Index: pa-protos.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/pa/pa-protos.h,v
retrieving revision 1.11
diff -c -3 -p -r1.11 pa-protos.h
*** pa-protos.h 21 Jan 2002 22:57:48 -0000 1.11
--- pa-protos.h 6 May 2002 16:48:46 -0000
*************** extern int is_function_label_plus_const
*** 103,108 ****
--- 103,109 ----
extern int jump_in_call_delay PARAMS ((rtx));
extern enum reg_class secondary_reload_class PARAMS ((enum reg_class,
enum machine_mode, rtx));
+ extern int hppa_fpstore_bypass_p PARAMS ((rtx, rtx));
/* Declare functions defined in pa.c and used in templates. */
Index: pa.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/pa/pa.c,v
retrieving revision 1.155
diff -c -3 -p -r1.155 pa.c
*** pa.c 30 Apr 2002 16:47:43 -0000 1.155
--- pa.c 6 May 2002 16:48:51 -0000
*************** hppa_use_dfa_pipeline_interface ()
*** 60,65 ****
--- 60,92 ----
return 1;
}
+ /* Return nonzero if there is a bypass for the output of
+ OUT_INSN and the fp store IN_INSN. */
+ int
+ hppa_fpstore_bypass_p (out_insn, in_insn)
+ rtx out_insn, in_insn;
+ {
+ enum machine_mode store_mode;
+ enum machine_mode other_mode;
+ rtx set;
+
+ if (recog_memoized (in_insn) < 0
+ || get_attr_type (in_insn) != TYPE_FPSTORE
+ || recog_memoized (out_insn) < 0)
+ return 0;
+
+ store_mode = GET_MODE (SET_SRC (PATTERN (in_insn)));
+
+ set = single_set (out_insn);
+ if (!set)
+ return 0;
+
+ other_mode = GET_MODE (SET_SRC (set));
+
+ return (GET_MODE_SIZE (store_mode) == GET_MODE_SIZE (other_mode));
+ }
+
+
#ifndef DO_FRAME_NOTES
#ifdef INCOMING_RETURN_ADDR_RTX
#define DO_FRAME_NOTES 1
*************** pa_adjust_cost (insn, link, dep_insn, co
*** 3907,3914 ****
{
enum attr_type attr_type;
! /* Don't adjust costs for a pa8000 chip. */
! if (pa_cpu >= PROCESSOR_8000)
return cost;
if (! recog_memoized (insn))
--- 3934,3942 ----
{
enum attr_type attr_type;
! /* Don't adjust costs for a pa8000 chip, also do not adjust any
! true dependencies as they are described with bypasses now. */
! if (pa_cpu >= PROCESSOR_8000 || REG_NOTE_KIND (link) == 0)
return cost;
if (! recog_memoized (insn))
*************** pa_adjust_cost (insn, link, dep_insn, co
*** 3916,3980 ****
attr_type = get_attr_type (insn);
! if (REG_NOTE_KIND (link) == 0)
! {
! /* Data dependency; DEP_INSN writes a register that INSN reads some
! cycles later. */
!
! if (attr_type == TYPE_FPSTORE)
! {
! rtx pat = PATTERN (insn);
! rtx dep_pat = PATTERN (dep_insn);
! if (GET_CODE (pat) == PARALLEL)
! {
! /* This happens for the fstXs,mb patterns. */
! pat = XVECEXP (pat, 0, 0);
! }
! if (GET_CODE (pat) != SET || GET_CODE (dep_pat) != SET)
! /* If this happens, we have to extend this to schedule
! optimally. Return 0 for now. */
! return 0;
!
! if (rtx_equal_p (SET_DEST (dep_pat), SET_SRC (pat)))
! {
! if (! recog_memoized (dep_insn))
! return 0;
! /* DEP_INSN is writing its result to the register
! being stored in the fpstore INSN. */
! switch (get_attr_type (dep_insn))
! {
! case TYPE_FPLOAD:
! /* This cost 3 cycles, not 2 as the md says for the
! 700 and 7100, 7100lc, 7200 and 7300. */
! return cost + 1;
!
! case TYPE_FPALU:
! case TYPE_FPMULSGL:
! case TYPE_FPMULDBL:
! case TYPE_FPDIVSGL:
! case TYPE_FPDIVDBL:
! case TYPE_FPSQRTSGL:
! case TYPE_FPSQRTDBL:
! /* In these important cases, we save one cycle compared to
! when flop instruction feed each other. */
! return cost - 1;
!
! default:
! return cost;
! }
! }
!
! /* A flop-flop true depenendency where the sizes of the operand
! carrying the dependency is difference causes an additional
! cycle stall on the 7100lc, 7200, and 7300. Similarly for
! a fpload-flop true dependency. */
! }
!
! /* For other data dependencies, the default cost specified in the
! md is correct. */
! return cost;
! }
! else if (REG_NOTE_KIND (link) == REG_DEP_ANTI)
{
/* Anti dependency; DEP_INSN reads a register that INSN writes some
cycles later. */
--- 3944,3950 ----
attr_type = get_attr_type (insn);
! if (REG_NOTE_KIND (link) == REG_DEP_ANTI)
{
/* Anti dependency; DEP_INSN reads a register that INSN writes some
cycles later. */
*************** pa_adjust_cost (insn, link, dep_insn, co
*** 4010,4019 ****
preceding arithmetic operation has finished if
the target of the fpload is any of the sources
(or destination) of the arithmetic operation. */
! if (hppa_use_dfa_pipeline_interface ())
! return insn_default_latency (dep_insn) - 1;
! else
! return cost - 1;
default:
return 0;
--- 3980,3986 ----
preceding arithmetic operation has finished if
the target of the fpload is any of the sources
(or destination) of the arithmetic operation. */
! return insn_default_latency (dep_insn) - 1;
default:
return 0;
*************** pa_adjust_cost (insn, link, dep_insn, co
*** 4048,4057 ****
preceding divide or sqrt operation has finished if
the target of the ALU flop is any of the sources
(or destination) of the divide or sqrt operation. */
! if (hppa_use_dfa_pipeline_interface ())
! return insn_default_latency (dep_insn) - 2;
! else
! return cost - 2;
default:
return 0;
--- 4015,4021 ----
preceding divide or sqrt operation has finished if
the target of the ALU flop is any of the sources
(or destination) of the divide or sqrt operation. */
! return insn_default_latency (dep_insn) - 2;
default:
return 0;
*************** pa_adjust_cost (insn, link, dep_insn, co
*** 4101,4110 ****
Exception: For PA7100LC, PA7200 and PA7300, the cost
is 3 cycles, unless they bundle together. We also
pay the penalty if the second insn is a fpload. */
! if (hppa_use_dfa_pipeline_interface ())
! return insn_default_latency (dep_insn) - 1;
! else
! return cost - 1;
default:
return 0;
--- 4065,4071 ----
Exception: For PA7100LC, PA7200 and PA7300, the cost
is 3 cycles, unless they bundle together. We also
pay the penalty if the second insn is a fpload. */
! return insn_default_latency (dep_insn) - 1;
default:
return 0;
*************** pa_adjust_cost (insn, link, dep_insn, co
*** 4139,4148 ****
preceding divide or sqrt operation has finished if
the target of the ALU flop is also the target of
the divide or sqrt operation. */
! if (hppa_use_dfa_pipeline_interface ())
! return insn_default_latency (dep_insn) - 2;
! else
! return cost - 2;
default:
return 0;
--- 4100,4106 ----
preceding divide or sqrt operation has finished if
the target of the ALU flop is also the target of
the divide or sqrt operation. */
! return insn_default_latency (dep_insn) - 2;
default:
return 0;
Index: pa.md
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/pa/pa.md,v
retrieving revision 1.105
diff -c -3 -p -r1.105 pa.md
*** pa.md 30 Apr 2002 15:32:10 -0000 1.105
--- pa.md 6 May 2002 16:48:57 -0000
***************
*** 206,225 ****
"fpmpy_700*18")
(define_insn_reservation "W7" 2
! (and (eq_attr "type" "load,fpload")
(eq_attr "cpu" "700"))
"mem_700")
! (define_insn_reservation "W8" 3
! (and (eq_attr "type" "store,fpstore")
(eq_attr "cpu" "700"))
"mem_700*3")
! (define_insn_reservation "W9" 1
(and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpdivdbl,fpsqrtsgl,fpsqrtdbl,load,fpload,store,fpstore")
(eq_attr "cpu" "700"))
"dummy_700")
;; Function units for the 7100 and 7150. The 7100/7150 can dual-issue
;; floating point computations with non-floating point computations (fp loads
;; and stores are not fp computations).
--- 206,246 ----
"fpmpy_700*18")
(define_insn_reservation "W7" 2
! (and (eq_attr "type" "load")
(eq_attr "cpu" "700"))
"mem_700")
! (define_insn_reservation "W8" 2
! (and (eq_attr "type" "fpload")
! (eq_attr "cpu" "700"))
! "mem_700")
!
! (define_insn_reservation "W9" 3
! (and (eq_attr "type" "store")
! (eq_attr "cpu" "700"))
! "mem_700*3")
!
! (define_insn_reservation "W10" 3
! (and (eq_attr "type" "fpstore")
(eq_attr "cpu" "700"))
"mem_700*3")
! (define_insn_reservation "W11" 1
(and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpdivdbl,fpsqrtsgl,fpsqrtdbl,load,fpload,store,fpstore")
(eq_attr "cpu" "700"))
"dummy_700")
+ ;; We have a bypass for all computations in the FP unit which feed an
+ ;; FP store as long as the sizes are the same.
+ (define_bypass 2 "W1,W2" "W10" "hppa_fpstore_bypass_p")
+ (define_bypass 9 "W3" "W10" "hppa_fpstore_bypass_p")
+ (define_bypass 11 "W4" "W10" "hppa_fpstore_bypass_p")
+ (define_bypass 13 "W5" "W10" "hppa_fpstore_bypass_p")
+ (define_bypass 17 "W6" "W10" "hppa_fpstore_bypass_p")
+
+ ;; We have an "anti-bypass" for FP loads which feed an FP store.
+ (define_bypass 4 "W8" "W10" "hppa_fpstore_bypass_p")
+
;; Function units for the 7100 and 7150. The 7100/7150 can dual-issue
;; floating point computations with non-floating point computations (fp loads
;; and stores are not fp computations).
***************
*** 228,235 ****
;; take two cycles, during which no Dcache operations should be scheduled.
;; Any special cases are handled in pa_adjust_cost. The 7100, 7150 and 7100LC
;; all have the same memory characteristics if one disregards cache misses.
!
;; The 7100/7150 has three floating-point units: ALU, MUL, and DIV.
;; Timings:
;; Instruction Time Unit Minimum Distance (unit contention)
;; fcpy 2 ALU 1
--- 249,260 ----
;; take two cycles, during which no Dcache operations should be scheduled.
;; Any special cases are handled in pa_adjust_cost. The 7100, 7150 and 7100LC
;; all have the same memory characteristics if one disregards cache misses.
! ;;
;; The 7100/7150 has three floating-point units: ALU, MUL, and DIV.
+ ;; There's no value in modeling the ALU and MUL separately though
+ ;; since there can never be a functional unit conflict given the
+ ;; latency and issue rates for those units.
+ ;;
;; Timings:
;; Instruction Time Unit Minimum Distance (unit contention)
;; fcpy 2 ALU 1
***************
*** 247,257 ****
;; fdiv,dbl 15 DIV 15
;; fsqrt,sgl 8 DIV 8
;; fsqrt,dbl 15 DIV 15
- ;;
- ;; We don't really model the FP ALU/MPY units properly (they are
- ;; distinct subunits in the FP unit). However, there can never be
- ;; a functional unit; conflict given the latency and issue rates
- ;; for those units.
(define_automaton "pa7100")
(define_cpu_unit "i_7100, f_7100,fpmac_7100,fpdivsqrt_7100,mem_7100"
"pa7100")
--- 272,277 ----
***************
*** 272,292 ****
"f_7100+fpdivsqrt_7100,fpdivsqrt_7100*14")
(define_insn_reservation "X3" 2
! (and (eq_attr "type" "load,fpload")
(eq_attr "cpu" "7100"))
"i_7100+mem_7100")
(define_insn_reservation "X4" 2
! (and (eq_attr "type" "store,fpstore")
(eq_attr "cpu" "7100"))
"i_7100+mem_7100,mem_7100")
! (define_insn_reservation "X5" 1
(and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore")
(eq_attr "cpu" "7100"))
"i_7100")
;; The 7100LC has three floating-point units: ALU, MUL, and DIV.
;; Timings:
;; Instruction Time Unit Minimum Distance (unit contention)
;; fcpy 2 ALU 1
--- 292,336 ----
"f_7100+fpdivsqrt_7100,fpdivsqrt_7100*14")
(define_insn_reservation "X3" 2
! (and (eq_attr "type" "load")
(eq_attr "cpu" "7100"))
"i_7100+mem_7100")
(define_insn_reservation "X4" 2
! (and (eq_attr "type" "fpload")
! (eq_attr "cpu" "7100"))
! "i_7100+mem_7100")
!
! (define_insn_reservation "X5" 2
! (and (eq_attr "type" "store")
(eq_attr "cpu" "7100"))
"i_7100+mem_7100,mem_7100")
! (define_insn_reservation "X6" 2
! (and (eq_attr "type" "fpstore")
! (eq_attr "cpu" "7100"))
! "i_7100+mem_7100,mem_7100")
!
! (define_insn_reservation "X7" 1
(and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore")
(eq_attr "cpu" "7100"))
"i_7100")
+ ;; We have a bypass for all computations in the FP unit which feed an
+ ;; FP store as long as the sizes are the same.
+ (define_bypass 1 "X0" "X6" "hppa_fpstore_bypass_p")
+ (define_bypass 7 "X1" "X6" "hppa_fpstore_bypass_p")
+ (define_bypass 14 "X2" "X6" "hppa_fpstore_bypass_p")
+
+ ;; We have an "anti-bypass" for FP loads which feed an FP store.
+ (define_bypass 3 "X4" "X6" "hppa_fpstore_bypass_p")
+
;; The 7100LC has three floating-point units: ALU, MUL, and DIV.
+ ;; There's no value in modeling the ALU and MUL separately though
+ ;; since there can never be a functional unit conflict that
+ ;; can be avoided given the latency, issue rates and mandatory
+ ;; one cycle cpu-wide lock for a double precision fp multiply.
+ ;;
;; Timings:
;; Instruction Time Unit Minimum Distance (unit contention)
;; fcpy 2 ALU 1
***************
*** 321,349 ****
;;
;; load-load pairs
;; store-store pairs
- ;; fmpyadd,dbl
- ;; fmpysub,dbl
;; other issue modeling
(define_automaton "pa7100lc")
(define_cpu_unit "i0_7100lc, i1_7100lc, f_7100lc" "pa7100lc")
! (define_cpu_unit "fpalu_7100lc,fpmul_7100lc" "pa7100lc")
(define_cpu_unit "mem_7100lc" "pa7100lc")
- (define_insn_reservation "Y0" 2
- (and (eq_attr "type" "fpcc,fpalu")
- (eq_attr "cpu" "7100LC,7200,7300"))
- "f_7100lc,fpalu_7100lc")
-
;; Double precision multiplies lock the entire CPU for one
;; cycle. There is no way to avoid this lock and trying to
;; schedule around the lock is pointless and thus there is no
! ;; value in trying to model this lock. Not modeling the lock
! ;; allows for a smaller DFA and may reduce register pressure.
! (define_insn_reservation "Y1" 2
! (and (eq_attr "type" "fpmulsgl,fpmuldbl")
(eq_attr "cpu" "7100LC,7200,7300"))
! "f_7100lc,fpmul_7100lc")
;; fp division and sqrt instructions lock the entire CPU for
;; 7 cycles (single precision) or 14 cycles (double precision).
--- 365,389 ----
;;
;; load-load pairs
;; store-store pairs
;; other issue modeling
(define_automaton "pa7100lc")
(define_cpu_unit "i0_7100lc, i1_7100lc, f_7100lc" "pa7100lc")
! (define_cpu_unit "fpmac_7100lc" "pa7100lc")
(define_cpu_unit "mem_7100lc" "pa7100lc")
;; Double precision multiplies lock the entire CPU for one
;; cycle. There is no way to avoid this lock and trying to
;; schedule around the lock is pointless and thus there is no
! ;; value in trying to model this lock.
! ;;
! ;; Not modeling the lock allows us to treat fp multiplies just
! ;; like any other FP alu instruction. It allows for a smaller
! ;; DFA and may reduce register pressure.
! (define_insn_reservation "Y0" 2
! (and (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl")
(eq_attr "cpu" "7100LC,7200,7300"))
! "f_7100lc,fpmac_7100lc")
;; fp division and sqrt instructions lock the entire CPU for
;; 7 cycles (single precision) or 14 cycles (double precision).
***************
*** 351,392 ****
;; around the lock is pointless and thus there is no value in
;; trying to model this lock. Not modeling the lock allows
;; for a smaller DFA and may reduce register pressure.
! (define_insn_reservation "Y2" 1
(and (eq_attr "type" "fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl")
(eq_attr "cpu" "7100LC,7200,7300"))
"f_7100lc")
(define_insn_reservation "Y3" 2
! (and (eq_attr "type" "load,fpload")
(eq_attr "cpu" "7100LC,7200,7300"))
"i1_7100lc+mem_7100lc")
(define_insn_reservation "Y4" 2
! (and (eq_attr "type" "store,fpstore")
(eq_attr "cpu" "7100LC"))
"i1_7100lc+mem_7100lc,mem_7100lc")
! (define_insn_reservation "Y5" 1
(and (eq_attr "type" "shift,nullshift")
(eq_attr "cpu" "7100LC,7200,7300"))
"i1_7100lc")
! (define_insn_reservation "Y6" 1
(and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore,shift,nullshift")
(eq_attr "cpu" "7100LC,7200,7300"))
"(i0_7100lc|i1_7100lc)")
;; The 7200 has a store-load penalty
! (define_insn_reservation "Y7" 2
! (and (eq_attr "type" "store,fpstore")
(eq_attr "cpu" "7200"))
"i1_7100lc,mem_7100lc")
;; The 7300 has no penalty for store-store or store-load
! (define_insn_reservation "Y8" 2
! (and (eq_attr "type" "store,fpstore")
(eq_attr "cpu" "7300"))
"i1_7100lc")
;; Scheduling for the PA8000 is somewhat different than scheduling for a
;; traditional architecture.
--- 391,455 ----
;; around the lock is pointless and thus there is no value in
;; trying to model this lock. Not modeling the lock allows
;; for a smaller DFA and may reduce register pressure.
! (define_insn_reservation "Y1" 1
(and (eq_attr "type" "fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl")
(eq_attr "cpu" "7100LC,7200,7300"))
"f_7100lc")
+ (define_insn_reservation "Y2" 2
+ (and (eq_attr "type" "load")
+ (eq_attr "cpu" "7100LC,7200,7300"))
+ "i1_7100lc+mem_7100lc")
+
(define_insn_reservation "Y3" 2
! (and (eq_attr "type" "fpload")
(eq_attr "cpu" "7100LC,7200,7300"))
"i1_7100lc+mem_7100lc")
(define_insn_reservation "Y4" 2
! (and (eq_attr "type" "store")
! (eq_attr "cpu" "7100LC"))
! "i1_7100lc+mem_7100lc,mem_7100lc")
!
! (define_insn_reservation "Y5" 2
! (and (eq_attr "type" "fpstore")
(eq_attr "cpu" "7100LC"))
"i1_7100lc+mem_7100lc,mem_7100lc")
! (define_insn_reservation "Y6" 1
(and (eq_attr "type" "shift,nullshift")
(eq_attr "cpu" "7100LC,7200,7300"))
"i1_7100lc")
! (define_insn_reservation "Y7" 1
(and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore,shift,nullshift")
(eq_attr "cpu" "7100LC,7200,7300"))
"(i0_7100lc|i1_7100lc)")
;; The 7200 has a store-load penalty
! (define_insn_reservation "Y8" 2
! (and (eq_attr "type" "store")
! (eq_attr "cpu" "7200"))
! "i1_7100lc,mem_7100lc")
!
! (define_insn_reservation "Y9" 2
! (and (eq_attr "type" "fpstore")
(eq_attr "cpu" "7200"))
"i1_7100lc,mem_7100lc")
;; The 7300 has no penalty for store-store or store-load
! (define_insn_reservation "Y10" 2
! (and (eq_attr "type" "store")
(eq_attr "cpu" "7300"))
"i1_7100lc")
+
+ (define_insn_reservation "Y11" 2
+ (and (eq_attr "type" "fpstore")
+ (eq_attr "cpu" "7300"))
+ "i1_7100lc")
+
+ ;; We have an "anti-bypass" for FP loads which feed an FP store.
+ (define_bypass 3 "Y3" "Y5,Y9,Y11" "hppa_fpstore_bypass_p")
;; Scheduling for the PA8000 is somewhat different than scheduling for a
;; traditional architecture.