This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Prefetch for i386


Hi
This patch adds support for the prefetch/prefetchw instructions on IA-32.  Both
the MMX and the SSE prefetch instructions are supported.  The MMX prefetch has
the advantage of offering the prefetchw instruction, and the SSE prefetch is a
good idea since it is executed as a nop by earlier PPros/PIIs/Celerons.
Currently I am using prefetch(w) for K6/Athlon and prefetchnta for PPro.

Honza

Thu Apr 13 10:46:44 CEST 2000  Jan Hubicka  <jh@suse.cz>

	* i386.c (*_cost): Initialize prefetch_block and
	simultatenous_prefetches.
	(x86_3dNOW, x86_SSE): New global variables.
	(ix86_attr_length_default): Handle TYPE_PREFETCH.
	* i386.h (struct processor_costs): Add simultatenous_prefetches
	and prefetch_block.
	(ARCHMASK): New macro.
	(x86_3dNOW, x86_SSE): Declare.
	(TARGET_CMOVE): Use ARCHMASK.
	(TARGET_3DNOW, TARGET_SSE): New macros.
	(PREFETCH_BLOCK, SIMULTATENOUS_PREFETCHES): New macros.
	* i386.md (attr type): Add "prefetch".
	(attr length_opcode): Support prefetch type.
	(attr memory_operand, k6_alu, athlon_ieu): Likewise.
	(prefetch expander): New.
	(prefetch and prefetchw patterns): New.

*** ../../egcs.noprefetch/gcc/config/i386/i386.c	Sun Apr 30 22:23:47 2000
--- config/i386/i386.c	Sun Apr 30 22:51:00 2000
*************** struct processor_costs i386_cost = {	/* 
*** 74,80 ****
    2,					/* cost of reg,reg fld/fst */
    {8, 8, 8},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {8, 8, 8}				/* cost of loading integer registers */
  };
  
  struct processor_costs i486_cost = {	/* 486 specific costs */
--- 74,82 ----
    2,					/* cost of reg,reg fld/fst */
    {8, 8, 8},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {8, 8, 8},				/* cost of loading integer registers */
!   0,					/* size of prefetch block */
!   0,					/* number of prefetches doable in parallel */
  };
  
  struct processor_costs i486_cost = {	/* 486 specific costs */
*************** struct processor_costs i486_cost = {	/* 
*** 95,101 ****
    2,					/* cost of reg,reg fld/fst */
    {8, 8, 8},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {8, 8, 8}				/* cost of loading integer registers */
  };
  
  struct processor_costs pentium_cost = {
--- 97,105 ----
    2,					/* cost of reg,reg fld/fst */
    {8, 8, 8},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {8, 8, 8},				/* cost of loading integer registers */
!   0,					/* size of prefetch block */
!   0,					/* number of prefetches doable in parallel */
  };
  
  struct processor_costs pentium_cost = {
*************** struct processor_costs pentium_cost = {
*** 116,122 ****
    2,					/* cost of reg,reg fld/fst */
    {2, 2, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 6}				/* cost of loading integer registers */
  };
  
  struct processor_costs pentiumpro_cost = {
--- 120,128 ----
    2,					/* cost of reg,reg fld/fst */
    {2, 2, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 6},				/* cost of loading integer registers */
!   0,					/* size of prefetch block */
!   0,					/* number of prefetches doable in parallel */
  };
  
  struct processor_costs pentiumpro_cost = {
*************** struct processor_costs pentiumpro_cost =
*** 137,143 ****
    2,					/* cost of reg,reg fld/fst */
    {2, 2, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 6}				/* cost of loading integer registers */
  };
  
  struct processor_costs k6_cost = {
--- 143,154 ----
    2,					/* cost of reg,reg fld/fst */
    {2, 2, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 6},				/* cost of loading integer registers */
!   32,					/* size of prefetch block */
!   6					/* number of prefetches doable in
! 					   parallel */
! 					/* ??? Guess, only most recent PPRO
! 					   family CPUs do non-NOP prefetch.  */
  };
  
  struct processor_costs k6_cost = {
*************** struct processor_costs k6_cost = {
*** 158,164 ****
    4,					/* cost of reg,reg fld/fst */
    {6, 6, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 4}				/* cost of loading integer registers */
  };
  
  struct processor_costs athlon_cost = {
--- 169,178 ----
    4,					/* cost of reg,reg fld/fst */
    {6, 6, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 4},				/* cost of loading integer registers */
!   32,					/* size of prefetch block */
!   1					/* number of prefetches doable in
! 					   parallel */
  };
  
  struct processor_costs athlon_cost = {
*************** struct processor_costs athlon_cost = {
*** 179,185 ****
    4,					/* cost of reg,reg fld/fst */
    {6, 6, 20},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 16}				/* cost of loading integer registers */
  };
  
  struct processor_costs *ix86_cost = &pentium_cost;
--- 193,202 ----
    4,					/* cost of reg,reg fld/fst */
    {6, 6, 20},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 16},				/* cost of loading integer registers */
!   64,					/* size of prefetch block */
!   6					/* number of prefetches doable in
! 					   parallel */
  };
  
  struct processor_costs *ix86_cost = &pentium_cost;
*************** const int x86_add_esp_8 = m_ATHLON | m_P
*** 225,230 ****
--- 242,249 ----
  const int x86_integer_DFmode_moves = ~m_ATHLON;
  const int x86_partial_reg_dependency = m_ATHLON;
  const int x86_memory_mismatch_stall = m_ATHLON;
+ const int x86_3dNOW = m_ATHLON | m_K6;
+ const int x86_SSE = m_ATHLON | m_PPRO;
  
  #define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
  
*************** ix86_attr_length_default (insn)
*** 6161,6166 ****
--- 6180,6186 ----
      case TYPE_IDIV:
      case TYPE_PUSH:
      case TYPE_POP:
+     case TYPE_PREFETCH:
        for (i = recog_data.n_operands - 1; i >= 0; --i)
          if (CONSTANT_P (recog_data.operand[i]))
  	  {
*** ../../egcs.noprefetch/gcc/config/i386/i386.h	Sun Apr 30 22:23:47 2000
--- config/i386/i386.h	Sun Apr 30 22:25:52 2000
*************** struct processor_costs {
*** 72,77 ****
--- 72,81 ----
  				   in SFmode, DFmode and XFmode */
    int fp_store[3];		/* cost of storing FP register
  				   in SFmode, DFmode and XFmode */
+   int prefetch_block;		/* Size of block read by single
+ 				   prefetch operation. */
+   int simultatenous_prefetches; /* Number of prefetch operations
+ 				   doable in parallel.  */
  };
  
  extern struct processor_costs *ix86_cost;
*************** extern int target_flags;
*** 164,169 ****
--- 168,174 ----
  #define TARGET_ATHLON (ix86_cpu == PROCESSOR_ATHLON)
  
  #define CPUMASK (1 << ix86_cpu)
+ #define ARCHMASK (1 << ix86_arch)
  extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
  extern const int x86_use_bit_test, x86_cmove, x86_deep_branch;
  extern const int x86_unroll_strlen, x86_use_q_reg, x86_use_any_reg;
*************** extern const int x86_himode_math, x86_qi
*** 176,181 ****
--- 181,187 ----
  extern const int x86_promote_hi_regs, x86_integer_DFmode_moves;
  extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
  extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
+ extern const int x86_3dNOW, x86_SSE;
  
  #define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
  #define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
*************** extern const int x86_partial_reg_depende
*** 184,190 ****
  #define TARGET_UNROLL_STRLEN (x86_unroll_strlen & CPUMASK)
  #define TARGET_USE_Q_REG (x86_use_q_reg & CPUMASK)
  #define TARGET_USE_ANY_REG (x86_use_any_reg & CPUMASK)
! #define TARGET_CMOVE (x86_cmove & (1 << ix86_arch))
  #define TARGET_DEEP_BRANCH_PREDICTION (x86_deep_branch & CPUMASK)
  #define TARGET_DOUBLE_WITH_ADD (x86_double_with_add & CPUMASK)
  #define TARGET_USE_SAHF (x86_use_sahf & CPUMASK)
--- 190,196 ----
  #define TARGET_UNROLL_STRLEN (x86_unroll_strlen & CPUMASK)
  #define TARGET_USE_Q_REG (x86_use_q_reg & CPUMASK)
  #define TARGET_USE_ANY_REG (x86_use_any_reg & CPUMASK)
! #define TARGET_CMOVE (x86_cmove & ARCHMASK)
  #define TARGET_DEEP_BRANCH_PREDICTION (x86_deep_branch & CPUMASK)
  #define TARGET_DOUBLE_WITH_ADD (x86_double_with_add & CPUMASK)
  #define TARGET_USE_SAHF (x86_use_sahf & CPUMASK)
*************** extern const int x86_partial_reg_depende
*** 210,215 ****
--- 216,223 ----
  #define TARGET_INTEGER_DFMODE_MOVES (x86_integer_DFmode_moves & CPUMASK)
  #define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & CPUMASK)
  #define TARGET_MEMORY_MISMATCH_STALL (x86_memory_mismatch_stall & CPUMASK)
+ #define TARGET_3DNOW (x86_3dNOW & ARCHMASK)
+ #define TARGET_SSE (x86_SSE & ARCHMASK)
  
  #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
  
*************** while (0)
*** 1759,1764 ****
--- 1767,1778 ----
  /* Max number of bytes we can move from memory to memory
     in one reasonably fast instruction.  */
  #define MOVE_MAX 4
+ 
+ /* Size of block read by single prefetch operation.  */
+ #define PREFETCH_BLOCK ix86_cost->prefetch_block
+ 
+ /* Number of prefetch operations doable in parallel.  */
+ #define SIMULTATENOUS_PREFETCHES ix86_cost->simultatenous_prefetches
  
  /* If a memory-to-memory move would take MOVE_RATIO or more simple
     move-instruction pairs, we will do a movstr or libcall instead.
*** ../../egcs.noprefetch/gcc/config/i386/i386.md	Sun Apr 30 22:23:47 2000
--- config/i386/i386.md	Sun Apr 30 22:26:11 2000
***************
*** 71,76 ****
--- 71,78 ----
  ;; 9  This is an `fnstsw' operation.
  ;; 10 This is a `sahf' operation.
  ;; 11 This is a `fstcw' operation
+ ;; 12 This is a prefetch operation
+ ;; 13 This is a prefetchw operation
  ;;
  ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
  ;; from i386.c.
***************
*** 84,90 ****
  ;; A basic instruction type.  Refinements due to arguments to be
  ;; provided in other attributes.
  (define_attr "type"
!   "other,multi,alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,fmov,fop,fop1,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,str,cld"
    (const_string "other"))
  
  ;; The (bounding maximum) length of an instruction in bytes.
--- 86,92 ----
  ;; A basic instruction type.  Refinements due to arguments to be
  ;; provided in other attributes.
  (define_attr "type"
!   "other,multi,alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,fmov,fop,fop1,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,str,cld,prefetch"
    (const_string "other"))
  
  ;; The (bounding maximum) length of an instruction in bytes.
***************
*** 104,110 ****
  
  ;; Supporting: bytes in the opcode+modrm.
  (define_attr "length_opcode" ""
!   (cond [(eq_attr "type" "imovx,setcc,icmov")
  	   (const_int 3)
  	 (eq_attr "type" "str,cld")
  	   (const_int 1)
--- 106,112 ----
  
  ;; Supporting: bytes in the opcode+modrm.
  (define_attr "length_opcode" ""
!   (cond [(eq_attr "type" "imovx,setcc,icmov,prefetch")
  	   (const_int 3)
  	 (eq_attr "type" "str,cld")
  	   (const_int 1)
***************
*** 147,152 ****
--- 149,156 ----
  			      (match_operand 1 "memory_operand" ""))
  	     (const_string "load")
  	     (const_string "none"))
+ 	 (eq_attr "type" "prefetch")
+ 	   (const_string "load")
  	 (eq_attr "type" "ibr")
  	   (if_then_else (match_operand 0 "memory_operand" "")
  	     (const_string "load")
***************
*** 637,643 ****
  
  (define_function_unit "k6_alu" 2 0
    (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "ishift,alu1,negnot,alu,icmp,imovx,incdec,setcc,lea"))
    1 1)
  
  (define_function_unit "k6_alu" 2 0
--- 641,647 ----
  
  (define_function_unit "k6_alu" 2 0
    (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "ishift,alu1,negnot,alu,icmp,imovx,incdec,setcc,lea,prefetch"))
    1 1)
  
  (define_function_unit "k6_alu" 2 0
***************
*** 766,772 ****
  
  (define_function_unit "athlon_ieu" 3 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,ibr,call,callv,icmov,cld,pop,setcc,push,pop"))
    1 1)
  
  (define_function_unit "athlon_ieu" 3 0
--- 770,776 ----
  
  (define_function_unit "athlon_ieu" 3 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,ibr,call,callv,icmov,cld,pop,setcc,push,pop,prefetch"))
    1 1)
  
  (define_function_unit "athlon_ieu" 3 0
***************
*** 9330,9335 ****
--- 9334,9364 ----
     fcmov%F1\\t{%2, %0|%0, %2}
     fcmov%f1\\t{%3, %0|%0, %3}"
    [(set_attr "type" "fcmov")])
+ 
+ ;; Prefetch patterns
+ 
+ (define_expand "prefetch"
+   [(unspec [(match_operand:SI 0 "address_operand" "")] 12)]
+   "TARGET_3DNOW || TARGET_SSE"
+   "")
+ 
+ (define_insn ""
+   [(unspec [(match_operand:SI 0 "address_operand" "p")] 12)]
+   "TARGET_3DNOW"
+   "prefetch\\t%a0"
+   [(set_attr "type" "prefetch")])
+ 
+ (define_insn ""
+   [(unspec [(match_operand:SI 0 "address_operand" "p")] 12)]
+   "TARGET_SSE"
+   "prefetchnta\\t%a0"
+   [(set_attr "type" "prefetch")])
+ 
+ (define_insn "prefetchw"
+   [(unspec [(match_operand:SI 0 "address_operand" "p")] 13)]
+   "TARGET_3DNOW"
+   "prefetchw\\t%a0"
+   [(set_attr "type" "prefetch")])
  
  ;; Misc patterns (?)
  

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]