This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

simple pentium4 support



Hi
This patch adds a tiny bit of Pentium 4 tuning.  Its primary purpose is
to add a single architecture with SSE/SSE2 support, so I can work on -march
based SSE enabling.

The patch omits the scheduling definitions, since I want to wait for Vlad/Bernd
to settle the scheduler patches to see what syntax they will use.

The benefits are currently about 10% in my benchmark - the Pentium 4 is
very touchy about the code it is executing.
Many of the latencies are guesses, since Intel didn't provide much documentation
concerning the load/store unit.

Honza

Tue Feb 27 14:22:01 CET 2001  Jan Hubicka  <jh@suse.cz>
	* i386.c (pentium4_cost): New.
	(m_PENT4): New macro.
	(x86_push_memory, x86_movx, x86_cmove, x86_deep_branch, x86_use_sahf,
	x86_sub_esp_4, x86_sub_esp_8, x86_add_esp_4, x86_add_esp_8,
	x86_integer_DFmode_moves, x86_partial_reg_dependency,
	x86_memory_mismatch_stall): Add Pentium4.
	(x86_use_q_reg, x86_use_any_reg): Kill.
	(override_options): Add pentium4.
	(incdec_operand): Return 0 for pentium4.
	(ix86_issue_rate): Add PROCESSOR_PENTIUM4 and PROCESSOR_ATHLON.
	* i386.h (x86_use_q_reg, x86_use_any_reg): Kill.
	(TARGET_PENTIUM4): Define.
	(enum processor_type): Add PROCESSOR_PENTIUM4.
	(CPP_CPU_DEFAULT_SPEC): Add pentium4 support.
	* i386.md (attribute "cpu"): Add pentium4.

Index: i386.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.c,v
retrieving revision 1.224
diff -c -3 -p -r1.224 i386.c
*** i386.c	2001/02/25 16:35:48	1.224
--- i386.c	2001/02/27 13:18:36
*************** struct processor_costs athlon_cost = {
*** 239,244 ****
--- 239,276 ----
    6					/* MMX or SSE register to integer */
  };
  
+ struct processor_costs pentium4_cost = {
+   1,					/* cost of an add instruction */
+   1,					/* cost of a lea instruction */
+   8,					/* variable shift costs */
+   8,					/* constant shift costs */
+   30,					/* cost of starting a multiply */
+   0,					/* cost of multiply per each bit set */
+   112,					/* cost of a divide/mod */
+   16,					/* "large" insn */
+   6,					/* MOVE_RATIO */
+   2,					/* cost for loading QImode using movzbl */
+   {4, 5, 4},				/* cost of loading integer registers
+ 					   in QImode, HImode and SImode.
+ 					   Relative to reg-reg move (2).  */
+   {2, 3, 2},				/* cost of storing integer registers */
+   2,					/* cost of reg,reg fld/fst */
+   {2, 2, 6},				/* cost of loading fp registers
+ 					   in SFmode, DFmode and XFmode */
+   {4, 4, 6},				/* cost of storing fp registers (TODO confirm) */
+   2,					/* cost of moving MMX register */
+   {2, 2},				/* cost of loading MMX registers
+ 					   in SImode and DImode */
+   {2, 2},				/* cost of storing MMX registers
+ 					   in SImode and DImode */
+   12,					/* cost of moving SSE register */
+   {12, 12, 12},			/* cost of loading SSE registers
+ 					   in SImode, DImode and TImode */
+   {2, 2, 8},				/* cost of storing SSE registers
+ 					   in SImode, DImode and TImode */
+   10,					/* MMX or SSE register to integer */
+ };
+ 
  struct processor_costs *ix86_cost = &pentium_cost;
  
  /* Processor feature/optimization bitmasks.  */
*************** struct processor_costs *ix86_cost = &pen
*** 248,266 ****
  #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
  #define m_K6  (1<<PROCESSOR_K6)
  #define m_ATHLON  (1<<PROCESSOR_ATHLON)
  
  const int x86_use_leave = m_386 | m_K6 | m_ATHLON;
! const int x86_push_memory = m_386 | m_K6 | m_ATHLON;
  const int x86_zero_extend_with_and = m_486 | m_PENT;
! const int x86_movx = m_ATHLON | m_PPRO /* m_386 | m_K6 */;
  const int x86_double_with_add = ~m_386;
  const int x86_use_bit_test = m_386;
  const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6;
! const int x86_use_q_reg = m_PENT | m_PPRO | m_K6;
! const int x86_use_any_reg = m_486;
! const int x86_cmove = m_PPRO | m_ATHLON;
! const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON;
! const int x86_use_sahf = m_PPRO | m_K6;
  const int x86_partial_reg_stall = m_PPRO;
  const int x86_use_loop = m_K6;
  const int x86_use_fiop = ~(m_PPRO | m_ATHLON | m_PENT);
--- 280,297 ----
  #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
  #define m_K6  (1<<PROCESSOR_K6)
  #define m_ATHLON  (1<<PROCESSOR_ATHLON)
+ #define m_PENT4  (1<<PROCESSOR_PENTIUM4)
  
  const int x86_use_leave = m_386 | m_K6 | m_ATHLON;
! const int x86_push_memory = m_386 | m_K6 | m_ATHLON | m_PENT4;
  const int x86_zero_extend_with_and = m_486 | m_PENT;
! const int x86_movx = m_ATHLON | m_PPRO | m_PENT4 /* m_386 | m_K6 */;
  const int x86_double_with_add = ~m_386;
  const int x86_use_bit_test = m_386;
  const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6;
! const int x86_cmove = m_PPRO | m_ATHLON | m_PENT4;
! const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON | m_PENT4;
! const int x86_use_sahf = m_PPRO | m_K6 | m_PENT4;
  const int x86_partial_reg_stall = m_PPRO;
  const int x86_use_loop = m_K6;
  const int x86_use_fiop = ~(m_PPRO | m_ATHLON | m_PENT);
*************** const int x86_qimode_math = ~(0);
*** 275,287 ****
  const int x86_promote_qi_regs = 0;
  const int x86_himode_math = ~(m_PPRO);
  const int x86_promote_hi_regs = m_PPRO;
! const int x86_sub_esp_4 = m_ATHLON | m_PPRO;
! const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486;
! const int x86_add_esp_4 = m_ATHLON | m_K6;
! const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486;
! const int x86_integer_DFmode_moves = ~m_ATHLON;
! const int x86_partial_reg_dependency = m_ATHLON;
! const int x86_memory_mismatch_stall = m_ATHLON;
  
  #define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
  
--- 306,318 ----
  const int x86_promote_qi_regs = 0;
  const int x86_himode_math = ~(m_PPRO);
  const int x86_promote_hi_regs = m_PPRO;
! const int x86_sub_esp_4 = m_ATHLON | m_PPRO | m_PENT4;
! const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486 | m_PENT4;
! const int x86_add_esp_4 = m_ATHLON | m_K6 | m_PENT4;
! const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4;
! const int x86_integer_DFmode_moves = ~(m_ATHLON | m_PENT4);
! const int x86_partial_reg_dependency = m_ATHLON | m_PENT4;
! const int x86_memory_mismatch_stall = m_ATHLON | m_PENT4;
  
  #define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
  
*************** override_options ()
*** 577,583 ****
        {&pentium_cost, 0, 0, -4, -4, -4, 1},
        {&pentiumpro_cost, 0, 0, 4, -4, 4, 1},
        {&k6_cost, 0, 0, -5, -5, 4, 1},
!       {&athlon_cost, 0, 0, 4, -4, 4, 1}
      };
  
    static struct pta
--- 612,619 ----
        {&pentium_cost, 0, 0, -4, -4, -4, 1},
        {&pentiumpro_cost, 0, 0, 4, -4, 4, 1},
        {&k6_cost, 0, 0, -5, -5, 4, 1},
!       {&athlon_cost, 0, 0, 4, -4, 4, 1},
!       {&pentium4_cost, 0, 0, 2, 2, 2, 1}
      };
  
    static struct pta
*************** override_options ()
*** 595,600 ****
--- 631,637 ----
        {"pentiumpro", PROCESSOR_PENTIUMPRO},
        {"k6", PROCESSOR_K6},
        {"athlon", PROCESSOR_ATHLON},
+       {"pentium4", PROCESSOR_PENTIUM4},
      };
  
    int const pta_size = sizeof (processor_alias_table) / sizeof (struct pta);
*************** incdec_operand (op, mode)
*** 1202,1207 ****
--- 1239,1248 ----
       register rtx op;
       enum machine_mode mode;
  {
+   /* On Pentium4, the inc and dec operations cause an extra dependency on
+      the flags register, since they leave the carry flag unmodified.  */
+   if (TARGET_PENTIUM4 && !optimize_size)
+     return 0;
    if (op == const1_rtx || op == constm1_rtx)
      return 1;
    if (GET_CODE (op) != CONST_INT)
*************** ix86_issue_rate ()
*** 6760,6765 ****
--- 7064,7071 ----
        return 2;
  
      case PROCESSOR_PENTIUMPRO:
+     case PROCESSOR_PENTIUM4:
+     case PROCESSOR_ATHLON:
        return 3;
  
      default:
Index: i386.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.h,v
retrieving revision 1.156
diff -c -3 -p -r1.156 i386.h
*** i386.h	2001/02/19 15:47:30	1.156
--- i386.h	2001/02/27 13:20:33
*************** extern int target_flags;
*** 184,194 ****
  #define TARGET_PENTIUMPRO (ix86_cpu == PROCESSOR_PENTIUMPRO)
  #define TARGET_K6 (ix86_cpu == PROCESSOR_K6)
  #define TARGET_ATHLON (ix86_cpu == PROCESSOR_ATHLON)
  
  #define CPUMASK (1 << ix86_cpu)
  extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
  extern const int x86_use_bit_test, x86_cmove, x86_deep_branch;
! extern const int x86_unroll_strlen, x86_use_q_reg, x86_use_any_reg;
  extern const int x86_double_with_add, x86_partial_reg_stall, x86_movx;
  extern const int x86_use_loop, x86_use_fiop, x86_use_mov0;
  extern const int x86_use_cltd, x86_read_modify_write;
--- 184,195 ----
  #define TARGET_PENTIUMPRO (ix86_cpu == PROCESSOR_PENTIUMPRO)
  #define TARGET_K6 (ix86_cpu == PROCESSOR_K6)
  #define TARGET_ATHLON (ix86_cpu == PROCESSOR_ATHLON)
+ #define TARGET_PENTIUM4 (ix86_cpu == PROCESSOR_PENTIUM4)
  
  #define CPUMASK (1 << ix86_cpu)
  extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
  extern const int x86_use_bit_test, x86_cmove, x86_deep_branch;
! extern const int x86_unroll_strlen;
  extern const int x86_double_with_add, x86_partial_reg_stall, x86_movx;
  extern const int x86_use_loop, x86_use_fiop, x86_use_mov0;
  extern const int x86_use_cltd, x86_read_modify_write;
*************** extern const int x86_partial_reg_depende
*** 204,211 ****
  #define TARGET_ZERO_EXTEND_WITH_AND (x86_zero_extend_with_and & CPUMASK)
  #define TARGET_USE_BIT_TEST (x86_use_bit_test & CPUMASK)
  #define TARGET_UNROLL_STRLEN (x86_unroll_strlen & CPUMASK)
- #define TARGET_USE_Q_REG (x86_use_q_reg & CPUMASK)
- #define TARGET_USE_ANY_REG (x86_use_any_reg & CPUMASK)
  /* For sane SSE instruction set generation we need fcomi instruction.  It is
     safe to enable all CMOVE instructions.  */
  #define TARGET_CMOVE ((x86_cmove & (1 << ix86_arch)) || TARGET_SSE)
--- 205,210 ----
*************** enum processor_type
*** 345,350 ****
--- 344,350 ----
    PROCESSOR_PENTIUMPRO,
    PROCESSOR_K6,
    PROCESSOR_ATHLON,
+   PROCESSOR_PENTIUM4,
    PROCESSOR_max
  };
  
*************** extern int ix86_arch;
*** 431,436 ****
--- 431,439 ----
  #if TARGET_CPU_DEFAULT == 5
  #define CPP_CPU_DEFAULT_SPEC "-D__tune_athlon__"
  #endif
+ #if TARGET_CPU_DEFAULT == 6
+ #define CPP_CPU_DEFAULT_SPEC "-D__tune_pentium4__"
+ #endif
  #ifndef CPP_CPU_DEFAULT_SPEC
  #define CPP_CPU_DEFAULT_SPEC "-D__tune_i386__"
  #endif
*************** extern int ix86_arch;
*** 449,460 ****
--- 452,465 ----
    %{!mcpu*:-D__tune_i686__ -D__tune_pentiumpro__ }}\
  %{march=k6:-D__k6 -D__k6__ %{!mcpu*:-D__tune_k6__ }}\
  %{march=athlon:-D__athlon -D__athlon__ %{!mcpu*:-D__tune_athlon__ }}\
+ %{mpentium4=pentium4:-D__pentium4 -D__pentium4__ %{!mcpu*:-D__tune_pentium4__ }}\
  %{m386|mcpu=i386:-D__tune_i386__ }\
  %{m486|mcpu=i486:-D__tune_i486__ }\
  %{mpentium|mcpu=pentium|mcpu=i586:-D__tune_i586__ -D__tune_pentium__ }\
  %{mpentiumpro|mcpu=pentiumpro|mcpu=i686:-D__tune_i686__ -D__tune_pentiumpro__ }\
  %{mcpu=k6:-D__tune_k6__ }\
  %{mcpu=athlon:-D__tune_athlon__ }\
+ %{mcpu=pentium4:-D__tune_pentium4__ }\
  %{!march*:%{!mcpu*:%{!m386:%{!m486:%{!mpentium*:%(cpp_cpu_default)}}}}}"
  #endif
  
Index: i386.md
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.md,v
retrieving revision 1.220
diff -c -3 -p -r1.220 i386.md
*** i386.md	2001/02/25 13:33:59	1.220
--- i386.md	2001/02/27 13:21:27
***************
*** 97,103 ****
  
  ;; Processor type.  This attribute must exactly match the processor_type
  ;; enumeration in i386.h.
! (define_attr "cpu" "i386,i486,pentium,pentiumpro,k6,athlon"
    (const (symbol_ref "ix86_cpu")))
  
  ;; A basic instruction type.  Refinements due to arguments to be
--- 97,103 ----
  
  ;; Processor type.  This attribute must exactly match the processor_type
  ;; enumeration in i386.h.
! (define_attr "cpu" "i386,i486,pentium,pentiumpro,k6,athlon,pentium4"
    (const (symbol_ref "ix86_cpu")))
  
  ;; A basic instruction type.  Refinements due to arguments to be


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]