This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
AMD K6 support
- To: egcs at cygnus dot com
- Subject: AMD K6 support
- From: Kurt Garloff <garloff at hft dot e-technik dot uni-dortmund dot de>
- Date: Wed, 19 Aug 1998 13:44:23 +0200
Hi everybody,
I found that my AMD K6 performs best with -O3 -mpentiumpro -malign-functions=4
-fschedule-insns2 -ffast-math on the numerical calculations I run.
As I don't like to specify -mpentiumpro -malign-functions=4 all the time, I
decided to create a -mamdk6 option that selects the correct values for
scheduling etc.
I looked into the K6 optimization docs, played a little bit with the
parameters in config/i386/i386.{md,c,h} and created a patch. I get somewhat
better performance than with -mpentiumpro -malign-functions=4.
The patch is appended. Enjoy, if you have an AMD K6.
Maybe some more experiments with non-numerical code could help to get even
better results, but so far I haven't been able to find parameters that
produce faster code.
Jeff: I'd be really happy to see this in egcs.
I hereby declare that this little patch was written entirely by me, and I
release it to the public under the terms of the GNU GPL.
Regards,
--
+--------------------------------------------------------------+
| cand.phys. Kurt Garloff C1-O4-101 (Chemiegeb"aude) |
|Lehrstuhl Hochfrequenztechnik (Fakult"at f"ur Elektrotechnik)|
| Universit"at Dortmund D - 44221 Dortmund |
| Tel.: (0231) 755-3947 Fax: (0231) 755-4631 |
| garloff@hft.e-technik.uni-dortmund.de |
| Public PGP key available ! |
+--------------------------------------------------------------+
P.S.:
Just for your information:
I also hand-coded some routines when experimenting. In contrast to what the
AMD docs say, you can use at least one of the new AMD 3DNow! instructions
without getting a penalty for the switch between FPU and MMX/3D: prefetch.
It greatly improved (factor 1.4) my double-precision(!) matrix-vector
multiplication by prefetching the next cache line before using it.
--- gcc/ChangeLog.orig Wed Aug 12 10:56:17 1998
+++ gcc/ChangeLog Mon Aug 17 09:06:01 1998
@@ -1,3 +1,10 @@
+Mon Aug 17 11:03:50 1998 Kurt Garloff (K.Garloff@ping.de)
+
+ * config/i386/i386.md: Added description and scheduling params
+ for AMD K6
+ * config/i386/i386.h: Likewise
+ * config/i386/i386.c: Likewise
+
Mon Aug 3 23:43:55 PDT 1998 Jeff Law (law@cygnus.com)
* version.c: Bump for snapshot.
--- gcc/config/i386/i386.h.orig Wed Aug 12 10:56:06 1998
+++ gcc/config/i386/i386.h Mon Aug 17 08:50:06 1998
@@ -1,5 +1,5 @@
/* Definitions of target machine for GNU compiler for Intel X86
- (386, 486, Pentium).
+ (386, 486, Pentium, PentiumPro, AmdK6).
Copyright (C) 1988, 92, 94, 95, 96, 97, 1998 Free Software Foundation, Inc.
This file is part of GNU CC.
@@ -155,20 +155,23 @@
#define TARGET_486 (ix86_cpu == PROCESSOR_I486)
#define TARGET_PENTIUM (ix86_cpu == PROCESSOR_PENTIUM)
#define TARGET_PENTIUMPRO (ix86_cpu == PROCESSOR_PENTIUMPRO)
+#define TARGET_AMDK6 (ix86_cpu == PROCESSOR_AMDK6)
#define TARGET_USE_LEAVE (ix86_cpu == PROCESSOR_I386)
-#define TARGET_PUSH_MEMORY (ix86_cpu == PROCESSOR_I386)
+#define TARGET_PUSH_MEMORY (ix86_cpu == PROCESSOR_I386 || ix86_cpu == PROCESSOR_AMDK6)
#define TARGET_ZERO_EXTEND_WITH_AND (ix86_cpu != PROCESSOR_I386 \
- && ix86_cpu != PROCESSOR_PENTIUMPRO)
+ && ix86_cpu != PROCESSOR_PENTIUMPRO && ix86_cpu != PROCESSOR_AMDK6)
#define TARGET_DOUBLE_WITH_ADD (ix86_cpu != PROCESSOR_I386)
-#define TARGET_USE_BIT_TEST (ix86_cpu == PROCESSOR_I386)
+#define TARGET_USE_BIT_TEST (ix86_cpu == PROCESSOR_I386) /* || ix86_cpu == PROCESSOR_AMDK5) */
#define TARGET_UNROLL_STRLEN (ix86_cpu != PROCESSOR_I386)
#define TARGET_USE_Q_REG (ix86_cpu == PROCESSOR_PENTIUM \
- || ix86_cpu == PROCESSOR_PENTIUMPRO)
+ || ix86_cpu == PROCESSOR_PENTIUMPRO || ix86_cpu == PROCESSOR_AMDK6)
#define TARGET_USE_ANY_REG (ix86_cpu == PROCESSOR_I486)
#define TARGET_CMOVE (ix86_arch == PROCESSOR_PENTIUMPRO)
-#define TARGET_DEEP_BRANCH_PREDICTION (ix86_cpu == PROCESSOR_PENTIUMPRO)
+#define TARGET_DEEP_BRANCH_PREDICTION (ix86_cpu == PROCESSOR_PENTIUMPRO || ix86_cpu == PROCESSOR_AMDK6)
#define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
+#define ISSUE_RATE (TARGET_AMDK6 ? 2 : 1)
+
#define TARGET_SWITCHES \
{ { "80387", MASK_80387 }, \
{ "no-80387", -MASK_80387 }, \
@@ -181,6 +184,7 @@
{ "no-486", 0 }, \
{ "pentium", 0 }, \
{ "pentiumpro", 0 }, \
+ { "amdk6", 0 }, \
{ "rtd", MASK_RTD }, \
{ "no-rtd", -MASK_RTD }, \
{ "align-double", MASK_ALIGN_DOUBLE }, \
@@ -219,7 +223,8 @@
{PROCESSOR_I386, /* 80386 */
PROCESSOR_I486, /* 80486DX, 80486SX, 80486DX[24] */
PROCESSOR_PENTIUM,
- PROCESSOR_PENTIUMPRO};
+ PROCESSOR_PENTIUMPRO,
+ PROCESSOR_AMDK6};
#define PROCESSOR_I386_STRING "i386"
#define PROCESSOR_I486_STRING "i486"
@@ -227,6 +232,7 @@
#define PROCESSOR_PENTIUM_STRING "pentium"
#define PROCESSOR_I686_STRING "i686"
#define PROCESSOR_PENTIUMPRO_STRING "pentiumpro"
+#define PROCESSOR_AMDK6_STRING "amdk6"
extern enum processor_type ix86_cpu;
@@ -240,6 +246,8 @@
? PROCESSOR_PENTIUM \
: ((enum processor_type) TARGET_CPU_DEFAULT == PROCESSOR_PENTIUMPRO) \
? PROCESSOR_PENTIUMPRO \
+ : ((enum processor_type) TARGET_CPU_DEFAULT == PROCESSOR_AMDK6) \
+ ? PROCESSOR_AMDK6 \
: PROCESSOR_I386
#define PROCESSOR_DEFAULT_STRING \
((enum processor_type) TARGET_CPU_DEFAULT == PROCESSOR_I486) \
@@ -248,6 +256,8 @@
? PROCESSOR_PENTIUM_STRING \
: ((enum processor_type) TARGET_CPU_DEFAULT == PROCESSOR_PENTIUMPRO) \
? PROCESSOR_PENTIUMPRO_STRING \
+ : ((enum processor_type) TARGET_CPU_DEFAULT == PROCESSOR_AMDK6_STRING) \
+ ? PROCESSOR_AMDK6_STRING \
: PROCESSOR_I386_STRING
/* This macro is similar to `TARGET_SWITCHES' but defines names of
@@ -301,7 +311,9 @@
%{mno-pentium:-mcpu=i486 -march=i486} \
%{mpentium:-mcpu=pentium} \
%{mno-pentiumpro:-mcpu=pentium} \
-%{mpentiumpro:-mcpu=pentiumpro}}"
+%{mpentiumpro:-mcpu=pentiumpro} \
+%{mnoamdk6:-mcpu=pentium} \
+%{mamdk6:-mcpu=amdk6}}"
#endif
#define CPP_486_SPEC "%{!ansi:-Di486} -D__i486 -D__i486__"
@@ -309,6 +321,8 @@
-D__i586 -D__i586__ -D__pentium -D__pentium__"
#define CPP_686_SPEC "%{!ansi:-Di686 -Dpentiumpro} \
-D__i686 -D__i686__ -D__pentiumpro -D__pentiumpro__"
+#define CPP_AMDK6_SPEC "%{!ansi:-Di686 -Damdk6} \
+ -D__i586 -D__i586__ -D__amd6 -D__amdk6__"
#ifndef CPP_CPU_DEFAULT_SPEC
#if TARGET_CPU_DEFAULT == 1
@@ -320,10 +334,14 @@
#if TARGET_CPU_DEFAULT == 3
#define CPP_CPU_DEFAULT_SPEC "%(cpp_686)"
#else
+#if TARGET_CPU_DEFAULT == 4
+#define CPP_CPU_DEFAULT_SPEC "%(cpp_amdk6)"
+#else
#define CPP_CPU_DEFAULT_SPEC ""
#endif
#endif
#endif
+#endif
#endif /* CPP_CPU_DEFAULT_SPEC */
#ifndef CPP_CPU_SPEC
@@ -333,6 +351,7 @@
%{mcpu=i486:%(cpp_486)} %{m486:%(cpp_486)} \
%{mpentium:%(cpp_586)} %{mcpu=pentium:%(cpp_586)} \
%{mpentiumpro:%(cpp_686)} %{mcpu=pentiumpro:%(cpp_686)} \
+%{mamdk6:%(cpp_amdk6)} %{mcpu=admk6:%(cpp_amdk6)} \
%{!mcpu*:%{!m486:%{!mpentium*:%(cpp_cpu_default)}}}"
#endif
@@ -358,6 +377,7 @@
{ "cpp_486", CPP_486_SPEC}, \
{ "cpp_586", CPP_586_SPEC}, \
{ "cpp_686", CPP_686_SPEC}, \
+ { "cpp_amdk6", CPP_AMDK6_SPEC}, \
{ "cpp_cpu_default", CPP_CPU_DEFAULT_SPEC }, \
{ "cpp_cpu", CPP_CPU_SPEC }, \
{ "cc1_cpu", CC1_CPU_SPEC }, \
--- gcc/config/i386/i386.c.orig Wed Aug 12 10:56:17 1998
+++ gcc/config/i386/i386.c Mon Aug 17 07:50:41 1998
@@ -100,7 +100,17 @@
17 /* cost of a divide/mod */
};
-struct processor_costs *ix86_cost = &pentium_cost;
+struct processor_costs amdk6_cost = {
+ 1, /* cost of an add instruction */
+ 1, /* cost of a lea instruction */
+ 3, /* variable shift costs */
+ 2, /* constant shift costs */
+ 4, /* cost of starting a multiply */
+ 0, /* cost of multiply per each bit set */
+ 17 /* cost of a divide/mod */
+};
+
+struct processor_costs *ix86_cost = &pentiumpro_cost;
#define AT_BP(mode) (gen_rtx_MEM ((mode), frame_pointer_rtx))
@@ -213,7 +223,9 @@
{PROCESSOR_I686_STRING, PROCESSOR_PENTIUMPRO, &pentiumpro_cost,
0, 0},
{PROCESSOR_PENTIUMPRO_STRING, PROCESSOR_PENTIUMPRO,
- &pentiumpro_cost, 0, 0}};
+ &pentiumpro_cost, 0, 0},
+ {PROCESSOR_AMDK6_STRING, PROCESSOR_AMDK6,
+ &amdk6_cost, 0, 0}};
int ptt_size = sizeof (processor_target_table) / sizeof (struct ptt);
@@ -349,7 +361,7 @@
i386_align_funcs, MAX_CODE_ALIGN);
}
else
- i386_align_funcs = def_align;
+ i386_align_funcs = (TARGET_AMDK6 ? 4 : def_align);
/* Validate -mbranch-cost= value, or provide default. */
if (i386_branch_cost_string)
--- gcc/config/i386/i386.md.orig Wed Aug 12 10:56:17 1998
+++ gcc/config/i386/i386.md Wed Aug 19 08:47:31 1998
@@ -97,6 +97,10 @@
3 0)
(define_function_unit "fp" 1 0
+ (and (eq_attr "type" "fpop") (eq_attr "cpu" "amdk6"))
+ 2 0)
+
+(define_function_unit "fp" 1 0
(and (eq_attr "type" "fpmul") (eq_attr "cpu" "pentium"))
7 0)
@@ -105,18 +109,38 @@
5 0)
(define_function_unit "fp" 1 0
+ (and (eq_attr "type" "fpmul") (eq_attr "cpu" "pentiumpro"))
+ 4 0)
+
+(define_function_unit "fp" 1 0
+ (and (eq_attr "type" "fpmul") (eq_attr "cpu" "amdk6"))
+ 5 0)
+
+(define_function_unit "fp" 1 0
(and (eq_attr "type" "idiv") (eq_attr "cpu" "pentiumpro"))
10 10)
(define_function_unit "fp" 1 0
+ (and (eq_attr "type" "idiv") (eq_attr "cpu" "amdk6"))
+ 10 9)
+
+(define_function_unit "fp" 1 0
(and (eq_attr "type" "imul") (eq_attr "cpu" "pentiumpro"))
6 0)
(define_function_unit "fp" 1 0
- (eq_attr "type" "fpdiv")
+ (and (eq_attr "type" "imul") (eq_attr "cpu" "amdk6"))
+ 5 0)
+
+(define_function_unit "fp" 1 0
+ (and (eq_attr "type" "fpdiv") (eq_attr "cpu" "!amdk6"))
10 10)
(define_function_unit "fp" 1 0
+ (and (eq_attr "type" "fpdiv") (eq_attr "cpu" "amdk6"))
+ 10 9)
+
+(define_function_unit "fp" 1 0
(eq_attr "type" "fld")
1 0)
@@ -140,7 +164,7 @@
;; Processor type -- this attribute must exactly match the processor_type
;; enumeration in i386.h.
-(define_attr "cpu" "i386,i486,pentium,pentiumpro"
+(define_attr "cpu" "i386,i486,pentium,pentiumpro,amdk6"
(const (symbol_ref "ix86_cpu")))
(define_insn "tstsi_1"
@@ -989,7 +1013,7 @@
operands[1] = i386_sext16_if_const (operands[1]);
return AS2 (mov%L0,%k1,%k0);
}
- if (TARGET_PENTIUMPRO)
+ if (TARGET_PENTIUMPRO || TARGET_AMDK6)
{
/* movzwl is faster than movw on the Pentium Pro,
* although not as fast as an aligned movl. */