This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Re: K6-Optimierungen im EGCS


Hi Jens-Uwe, hi Scott, hi egcs-people,

I append my latest AMD-K6 patches. Sorry for the delay. I was doing some
experiments with John Carr's K6 patches, and the results were quite
encouraging, but from time to time I got (reproducible) internal compiler
errors. I was able to fix one (caused by the use of cmove on K6), but not
another one. So I dropped working on this. That's why it took me some time.

So here is basically an updated version of the patch posted earlier. Some
more fine tuning done.

On Thu, Sep 17, 1998 at 05:16:05PM +0200, Jens-Uwe Rumstich wrote:
> Hi!
> 
> Gibt es eigentlich irgendwo eine leicht verstaendliche Einfuehrung
> in die i386.h/md/c-files? In den .Inf-files steht zwar einiges drinne,

Look into gcc/rtl.def for a description.
 
> Wo gibt man dem Compiler die Informationen fuer Instruction scheduling
> usw? 

The cost tables are in i386.c and the function unit definitions are in i386.md.

> Ansonsten, ich habe frueher recht viel in Assembler programmiert und
> optimiert, weiss also, was die CPUs moegen ;)  Weiterhin kenne ich
> dieses K6-2-Optimierungshandbuch.

Maybe you can write some simple test programs and look at the produced asm
code, if you want to take the effort.

If you want to hand-code things, I have two more hints for you.
* If you have a K6-2, use the prefetch insn. It is perfectly mixable with
  FPU insns and can speed up things quite a lot. (I got 40% speedup for a
  matrix vector multiply with doubles!)
* Make sure to have your insns not cross cache line boundaries by inserting
  nop at the right places.

Also note that K6 is sometimes different from other CPUs. E.g. you are
certainly used to using "xorl %eax,%eax" to clear the eax register. K6 prefers
"movl $0,%eax", though. But the K6 optimization manual is your friend.

-- 
+--------------------------------------------------------------+
|  cand.phys. Kurt Garloff        C1-O4-101 (Chemiegeb"aude)   |
|Lehrstuhl Hochfrequenztechnik  (Fakult"at f"ur Elektrotechnik)|
|   Universit"at Dortmund             D - 44221 Dortmund       |
|   Tel.: (0231) 755-3947            Fax: (0231) 755-4631      |
|            garloff@hft.e-technik.uni-dortmund.de             |
|                 Public PGP key available !                   | 
+--------------------------------------------------------------+
--- egcs-1.1a/gcc/config/i386/i386.c.orig	Wed Jul 29 00:31:09 1998
+++ egcs-1.1a/gcc/config/i386/i386.c	Thu Sep  3 20:56:45 1998
@@ -100,7 +100,17 @@
   17					/* cost of a divide/mod */
 };
 
-struct processor_costs *ix86_cost = &pentium_cost;
+struct processor_costs amdk6_cost = {
+  1,					/* cost of an add instruction */
+  1,					/* cost of a lea instruction */
+  3,					/* variable shift costs */
+  2,					/* constant shift costs */
+  4,					/* cost of starting a multiply */
+  0,					/* cost of multiply per each bit set */
+  18					/* cost of a divide/mod */
+};
+
+struct processor_costs *ix86_cost = &pentiumpro_cost;
 
 #define AT_BP(mode) (gen_rtx_MEM ((mode), frame_pointer_rtx))
 
@@ -213,7 +223,9 @@
 	   {PROCESSOR_I686_STRING, PROCESSOR_PENTIUMPRO, &pentiumpro_cost,
 	      0, 0},
 	   {PROCESSOR_PENTIUMPRO_STRING, PROCESSOR_PENTIUMPRO,
-	      &pentiumpro_cost, 0, 0}};
+	      &pentiumpro_cost, 0, 0},
+      	   {PROCESSOR_AMDK6_STRING, PROCESSOR_AMDK6,
+	      &amdk6_cost, 0, 0}};
 
   int ptt_size = sizeof (processor_target_table) / sizeof (struct ptt);
 
@@ -306,7 +318,7 @@
 
   /* The 486 suffers more from non-aligned cache line fills, and the
      larger code size results in a larger cache foot-print and more misses.
-     The 486 has a 16 byte cache line, pentium and pentiumpro have a 32 byte
+     The 486 has a 16 byte cache line, pentium, pentiumpro and K6 have a 32 byte
      cache line.  */
   def_align = (TARGET_486) ? 4 : 2;
 
@@ -349,7 +361,7 @@
 	       i386_align_funcs, MAX_CODE_ALIGN);
     }
   else
-    i386_align_funcs = def_align;
+     i386_align_funcs = TARGET_AMDK6 ? 4: def_align;
 
   /* Validate -mbranch-cost= value, or provide default. */
   if (i386_branch_cost_string)
--- egcs-1.1a/gcc/config/i386/i386.h.orig	Sun Jul 26 02:35:12 1998
+++ egcs-1.1a/gcc/config/i386/i386.h	Thu Sep  3 20:56:53 1998
@@ -1,5 +1,5 @@
 /* Definitions of target machine for GNU compiler for Intel X86
-   (386, 486, Pentium).
+   (386, 486, Pentium, PentiumPro, AmdK6).
    Copyright (C) 1988, 92, 94, 95, 96, 97, 1998 Free Software Foundation, Inc.
 
 This file is part of GNU CC.
@@ -155,20 +155,24 @@
 #define TARGET_486 (ix86_cpu == PROCESSOR_I486)
 #define TARGET_PENTIUM (ix86_cpu == PROCESSOR_PENTIUM)
 #define TARGET_PENTIUMPRO (ix86_cpu == PROCESSOR_PENTIUMPRO)
-#define TARGET_USE_LEAVE (ix86_cpu == PROCESSOR_I386)
-#define TARGET_PUSH_MEMORY (ix86_cpu == PROCESSOR_I386)
+#define TARGET_AMDK6 (ix86_cpu == PROCESSOR_AMDK6)
+
+#define TARGET_USE_LEAVE (ix86_cpu == PROCESSOR_I386 || ix86_cpu == PROCESSOR_AMDK6)
+#define TARGET_PUSH_MEMORY (ix86_cpu == PROCESSOR_I386 || ix86_cpu == PROCESSOR_AMDK6)
 #define TARGET_ZERO_EXTEND_WITH_AND (ix86_cpu != PROCESSOR_I386 \
-				     && ix86_cpu != PROCESSOR_PENTIUMPRO)
+				     && ix86_cpu != PROCESSOR_PENTIUMPRO && ix86_cpu != PROCESSOR_AMDK6)
 #define TARGET_DOUBLE_WITH_ADD (ix86_cpu != PROCESSOR_I386)
-#define TARGET_USE_BIT_TEST (ix86_cpu == PROCESSOR_I386)
+#define TARGET_USE_BIT_TEST (ix86_cpu == PROCESSOR_I386) /* || ix86_cpu == PROCESSOR_AMDK5) */
 #define TARGET_UNROLL_STRLEN (ix86_cpu != PROCESSOR_I386)
 #define TARGET_USE_Q_REG (ix86_cpu == PROCESSOR_PENTIUM \
-			  || ix86_cpu == PROCESSOR_PENTIUMPRO)
-#define TARGET_USE_ANY_REG (ix86_cpu == PROCESSOR_I486)
+			  || ix86_cpu == PROCESSOR_PENTIUMPRO /*|| ix86_cpu == PROCESSOR_AMDK6*/)
+#define TARGET_USE_ANY_REG (ix86_cpu == PROCESSOR_I486 || ix86_cpu == PROCESSOR_AMDK6)
 #define TARGET_CMOVE (ix86_arch == PROCESSOR_PENTIUMPRO)
-#define TARGET_DEEP_BRANCH_PREDICTION (ix86_cpu == PROCESSOR_PENTIUMPRO)
+#define TARGET_DEEP_BRANCH_PREDICTION (ix86_cpu == PROCESSOR_PENTIUMPRO || ix86_cpu == PROCESSOR_AMDK6)
 #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
 
+#define ISSUE_RATE (TARGET_AMDK6 ? 2 : 1)
+
 #define TARGET_SWITCHES							\
 { { "80387",			 MASK_80387 },				\
   { "no-80387",			-MASK_80387 },				\
@@ -181,6 +185,7 @@
   { "no-486",			 0 },					\
   { "pentium",			 0 },					\
   { "pentiumpro",		 0 },					\
+  { "amdk6",			 0 },					\
   { "rtd",			 MASK_RTD },				\
   { "no-rtd",			-MASK_RTD },				\
   { "align-double",		 MASK_ALIGN_DOUBLE },			\
@@ -219,7 +224,8 @@
  {PROCESSOR_I386,			/* 80386 */
   PROCESSOR_I486,			/* 80486DX, 80486SX, 80486DX[24] */
   PROCESSOR_PENTIUM,
-  PROCESSOR_PENTIUMPRO};
+  PROCESSOR_PENTIUMPRO,
+  PROCESSOR_AMDK6};
 
 #define PROCESSOR_I386_STRING "i386"
 #define PROCESSOR_I486_STRING "i486"
@@ -227,6 +233,7 @@
 #define PROCESSOR_PENTIUM_STRING "pentium"
 #define PROCESSOR_I686_STRING "i686"
 #define PROCESSOR_PENTIUMPRO_STRING "pentiumpro"
+#define PROCESSOR_AMDK6_STRING "amdk6"
 
 extern enum processor_type ix86_cpu;
 
@@ -240,6 +247,8 @@
 					       ? PROCESSOR_PENTIUM  \
   : ((enum processor_type) TARGET_CPU_DEFAULT == PROCESSOR_PENTIUMPRO) \
 					       ? PROCESSOR_PENTIUMPRO  \
+  : ((enum processor_type) TARGET_CPU_DEFAULT == PROCESSOR_AMDK6) \
+					       ? PROCESSOR_AMDK6  \
   : PROCESSOR_I386
 #define PROCESSOR_DEFAULT_STRING \
   ((enum processor_type) TARGET_CPU_DEFAULT == PROCESSOR_I486) \
@@ -248,6 +257,8 @@
 					       ? PROCESSOR_PENTIUM_STRING  \
   : ((enum processor_type) TARGET_CPU_DEFAULT == PROCESSOR_PENTIUMPRO) \
 					       ? PROCESSOR_PENTIUMPRO_STRING  \
+  : ((enum processor_type) TARGET_CPU_DEFAULT == PROCESSOR_AMDK6_STRING) \
+					       ? PROCESSOR_AMDK6_STRING  \
   : PROCESSOR_I386_STRING
 
 /* This macro is similar to `TARGET_SWITCHES' but defines names of
@@ -301,7 +312,9 @@
 %{mno-pentium:-mcpu=i486 -march=i486} \
 %{mpentium:-mcpu=pentium} \
 %{mno-pentiumpro:-mcpu=pentium} \
-%{mpentiumpro:-mcpu=pentiumpro}}"
+%{mpentiumpro:-mcpu=pentiumpro} \
+%{mno-amdk6:-mcpu=pentium} \
+%{mamdk6:-mcpu=amdk6}}"
 #endif
 
 #define CPP_486_SPEC "%{!ansi:-Di486} -D__i486 -D__i486__"
@@ -309,6 +322,8 @@
 	-D__i586 -D__i586__ -D__pentium -D__pentium__"
 #define CPP_686_SPEC "%{!ansi:-Di686 -Dpentiumpro} \
 	-D__i686 -D__i686__ -D__pentiumpro -D__pentiumpro__"
+#define CPP_AMDK6_SPEC "%{!ansi:-Di686 -Damdk6} \
+	-D__i586 -D__i586__ -D__amdk6 -D__amdk6__"
 
 #ifndef CPP_CPU_DEFAULT_SPEC
 #if TARGET_CPU_DEFAULT == 1
@@ -320,10 +335,14 @@
 #if TARGET_CPU_DEFAULT == 3
 #define CPP_CPU_DEFAULT_SPEC "%(cpp_686)"
 #else
+#if TARGET_CPU_DEFAULT == 4
+#define CPP_CPU_DEFAULT_SPEC "%(cpp_amdk6)"
+#else
 #define CPP_CPU_DEFAULT_SPEC ""
 #endif
 #endif
 #endif
+#endif
 #endif /* CPP_CPU_DEFAULT_SPEC */
 
 #ifndef CPP_CPU_SPEC
@@ -333,6 +352,7 @@
 %{mcpu=i486:%(cpp_486)} %{m486:%(cpp_486)} \
 %{mpentium:%(cpp_586)} %{mcpu=pentium:%(cpp_586)} \
 %{mpentiumpro:%(cpp_686)} %{mcpu=pentiumpro:%(cpp_686)} \
+%{mamdk6:%(cpp_amdk6)} %{mcpu=amdk6:%(cpp_amdk6)} \
 %{!mcpu*:%{!m486:%{!mpentium*:%(cpp_cpu_default)}}}"
 #endif
 
@@ -358,6 +378,7 @@
   { "cpp_486", CPP_486_SPEC},						\
   { "cpp_586", CPP_586_SPEC},						\
   { "cpp_686", CPP_686_SPEC},						\
+  { "cpp_amdk6", CPP_AMDK6_SPEC},					\
   { "cpp_cpu_default",	CPP_CPU_DEFAULT_SPEC },				\
   { "cpp_cpu",	CPP_CPU_SPEC },						\
   { "cc1_cpu",  CC1_CPU_SPEC },						\
--- egcs-1.1a/gcc/config/i386/i386.md.orig	Thu Jul 30 02:01:35 1998
+++ egcs-1.1a/gcc/config/i386/i386.md	Thu Sep  3 20:57:18 1998
@@ -97,6 +97,10 @@
  3 0)
 
 (define_function_unit "fp" 1 0
+ (and (eq_attr "type" "fpop") (eq_attr "cpu" "amdk6"))
+ 2 0)
+
+(define_function_unit "fp" 1 0
  (and (eq_attr "type" "fpmul") (eq_attr "cpu" "pentium")) 
  7 0)
 
@@ -105,6 +109,14 @@
  5 0)
 
 (define_function_unit "fp" 1 0
+ (and (eq_attr "type" "fpmul") (eq_attr "cpu" "pentiumpro")) 
+ 4 0)
+
+(define_function_unit "fp" 1 0
+ (and (eq_attr "type" "fpmul") (eq_attr "cpu" "amdk6")) 
+ 2 0)
+
+(define_function_unit "fp" 1 0
  (and (eq_attr "type" "idiv") (eq_attr "cpu" "pentiumpro")) 
  10 10)
 
@@ -113,13 +125,21 @@
  6 0)
 
 (define_function_unit "fp" 1 0
- (eq_attr "type" "fpdiv") 
+ (and (eq_attr "type" "fpdiv") (eq_attr "cpu" "!amdk6"))
  10 10)
 
 (define_function_unit "fp" 1 0
- (eq_attr "type" "fld") 
+ (and (eq_attr "type" "fpdiv") (eq_attr "cpu" "amdk6"))
+ 10 9)
+
+(define_function_unit "fp" 1 0
+ (and (eq_attr "type" "fld") (eq_attr "cpu" "!amdk6"))
  1 0)
 
+(define_function_unit "fp" 1 0
+ (and (eq_attr "type" "fld") (eq_attr "cpu" "amdk6"))
+ 2 0)
+
 (define_function_unit "integer" 1 0
   (and (eq_attr "type" "integer") (eq_attr "cpu" "!i386"))
  2 0)
@@ -140,7 +160,7 @@
 ;; Processor type -- this attribute must exactly match the processor_type
 ;; enumeration in i386.h.
 
-(define_attr "cpu" "i386,i486,pentium,pentiumpro"
+(define_attr "cpu" "i386,i486,pentium,pentiumpro,amdk6"
   (const (symbol_ref "ix86_cpu")))
 
 (define_insn "tstsi_1"
@@ -989,7 +1009,7 @@
 	  operands[1] = i386_sext16_if_const (operands[1]);
 	  return AS2 (mov%L0,%k1,%k0);
 	}
-      if (TARGET_PENTIUMPRO)
+      if (TARGET_PENTIUMPRO || TARGET_AMDK6)
 	{
 	  /* movzwl is faster than movw on the Pentium Pro,
 	   * although not as fast as an aligned movl. */

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]