Athlon specific optimization
Jan Hubicka
jh@suse.cz
Mon Jul 15 07:41:00 GMT 2002
Hi,
this simple pass makes Athlon code go considerably faster when the frame
pointer is omitted (2-3% on the SPECint score). Richard, this situation is
the cause of the performance penalty we have seen with -momit-leaf-frame-pointer.
Honza
Wed Jul 10 15:01:28 CEST 2002 Jan Hubicka <jh@suse.cz>
* i386.h (MACHINE_DEPENDENT_REORG): New macro.
* i386.c (x86_machine_dependent_reorg): New function.
* i386-protos.h (x86_machine_dependent_reorg): Declare.
Index: i386.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.h,v
retrieving revision 1.275
diff -c -3 -p -r1.275 i386.h
*** i386.h 20 Jun 2002 19:07:42 -0000 1.275
--- i386.h 10 Jul 2002 13:00:47 -0000
*************** enum fp_cw_mode {FP_CW_STORED, FP_CW_UNI
*** 3353,3358 ****
--- 3353,3359 ----
((SRC) < FIRST_STACK_REG || (SRC) > LAST_STACK_REG)
+ #define MACHINE_DEPENDENT_REORG(X) x86_machine_dependent_reorg(X)
/*
Local variables:
version-control: t
Index: i386-protos.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386-protos.h,v
retrieving revision 1.75
diff -c -3 -p -r1.75 i386-protos.h
*** i386-protos.h 16 Jun 2002 20:18:24 -0000 1.75
--- i386-protos.h 10 Jul 2002 13:00:47 -0000
*************** extern int x86_field_alignment PARAMS ((
*** 209,214 ****
--- 209,215 ----
#endif
extern rtx ix86_tls_get_addr PARAMS ((void));
+ extern void x86_machine_dependent_reorg PARAMS ((rtx));
/* In winnt.c */
extern void i386_pe_encode_section_info PARAMS ((tree, int));
Index: i386.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.c,v
retrieving revision 1.429
diff -c -3 -p -r1.429 i386.c
*** i386.c 3 Jul 2002 14:15:44 -0000 1.429
--- i386.c 10 Jul 2002 13:00:54 -0000
*************** x86_field_alignment (field, computed)
*** 13666,13669 ****
--- 13666,13713 ----
return computed;
}
+ /* Implement machine-specific optimizations.
+ At the moment we implement a single transformation: AMD Athlon works faster
+ when RET is not the destination of a conditional jump or directly preceded
+ by another jump instruction. We avoid the penalty by inserting a NOP just
+ before the RET instruction in such cases. */
+ void
+ x86_machine_dependent_reorg (first)
+ rtx first ATTRIBUTE_UNUSED;
+ {
+ basic_block bb;
+
+ if (!TARGET_ATHLON || !optimize || optimize_size) /* Only for Athlon, when optimizing, and not for size.  */
+ return;
+ FOR_EACH_BB (bb)
+ {
+ rtx ret = bb->end; /* Candidate RET: the insn recorded as the block's end.  */
+ rtx prev;
+ bool insert = false; /* Set when a NOP must be emitted before RET.  */
+
+ if (GET_CODE (ret) != JUMP_INSN || GET_CODE (PATTERN (ret)) != RETURN) /* Block does not end in a plain RETURN jump.  */
+ continue;
+ if (!maybe_hot_bb_p (bb)) /* Presumably skips cold blocks, where the padding would not pay off.  */
+ continue;
+ prev = prev_nonnote_insn (ret);
+ if (prev && GET_CODE (prev) == CODE_LABEL) /* First case: RET directly follows a label, so it may be a jump target.  */
+ {
+ edge e;
+ for (e = bb->pred; e; e = e->pred_next) /* Walk the incoming edges of this block.  */
+ if (EDGE_FREQUENCY (e) && e->src->index > 0 /* NOTE(review): sources with index <= 0 are ignored -- confirm block 0 should not count.  */
+ && !(e->flags & EDGE_FALLTHRU)) /* Fallthrough edges do not count as jumps to RET.  */
+ insert = 1;
+ }
+ if (!insert)
+ {
+ prev = prev_real_insn (ret); /* Second case: inspect the insn directly before RET.  */
+ if (prev && GET_CODE (prev) == JUMP_INSN
+ && any_condjump_p (prev)) /* Only conditional jumps are tested here.  */
+ insert = 1;
+ }
+ if (insert)
+ emit_insn_before (gen_nop (), ret);
+ }
+ }
+
#include "gt-i386.h"
More information about the Gcc-patches
mailing list