Athlon specific optimization

Jan Hubicka jh@suse.cz
Mon Jul 15 07:41:00 GMT 2002


Hi,
this simple pass makes Athlon code run considerably faster when the frame
pointer is omitted (2-3% on the SPECint score).  Richard, this situation is
the cause of the performance penalty we have seen with -momit-leaf-frame-pointer.

Honza

Wed Jul 10 15:01:28 CEST 2002  Jan Hubicka  <jh@suse.cz>
	* i386.h (MACHINE_DEPENDENT_REORG): New macro.
	* i386.c (x86_machine_dependent_reorg): New function.
	* i386-protos.h (x86_machine_dependent_reorg): Declare.
Index: i386.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.h,v
retrieving revision 1.275
diff -c -3 -p -r1.275 i386.h
*** i386.h	20 Jun 2002 19:07:42 -0000	1.275
--- i386.h	10 Jul 2002 13:00:47 -0000
*************** enum fp_cw_mode {FP_CW_STORED, FP_CW_UNI
*** 3353,3358 ****
--- 3353,3359 ----
     ((SRC) < FIRST_STACK_REG || (SRC) > LAST_STACK_REG)
  
  
+ #define MACHINE_DEPENDENT_REORG(X) x86_machine_dependent_reorg(X)
  /*
  Local variables:
  version-control: t
Index: i386-protos.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386-protos.h,v
retrieving revision 1.75
diff -c -3 -p -r1.75 i386-protos.h
*** i386-protos.h	16 Jun 2002 20:18:24 -0000	1.75
--- i386-protos.h	10 Jul 2002 13:00:47 -0000
*************** extern int x86_field_alignment PARAMS ((
*** 209,214 ****
--- 209,215 ----
  #endif
  
  extern rtx ix86_tls_get_addr PARAMS ((void));
+ extern void x86_machine_dependent_reorg PARAMS ((rtx));
  
  /* In winnt.c  */
  extern void i386_pe_encode_section_info PARAMS ((tree, int));
Index: i386.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.c,v
retrieving revision 1.429
diff -c -3 -p -r1.429 i386.c
*** i386.c	3 Jul 2002 14:15:44 -0000	1.429
--- i386.c	10 Jul 2002 13:00:54 -0000
*************** x86_field_alignment (field, computed)
*** 13666,13669 ****
--- 13666,13713 ----
    return computed;
  }
  
+ /* Implement machine specific optimizations.
+    At the moment we implement a single transformation: AMD Athlon works faster
+    when RET is not the destination of a conditional jump or directly preceded
+    by another jump instruction.  We avoid the penalty by inserting a NOP just
+    before the RET instructions in such cases.  */
+ void
+ x86_machine_dependent_reorg (first)
+      rtx first ATTRIBUTE_UNUSED;
+ {
+   basic_block bb;
+ 
+   if (!TARGET_ATHLON || !optimize || optimize_size)
+     return;
+   FOR_EACH_BB (bb)
+   {
+     rtx ret = bb->end;
+     rtx prev;
+     bool insert = false;
+ 
+     if (GET_CODE (ret) != JUMP_INSN || GET_CODE (PATTERN (ret)) != RETURN)
+       continue;
+     if (!maybe_hot_bb_p (bb))
+       continue;
+     prev = prev_nonnote_insn (ret);
+     if (prev && GET_CODE (prev) == CODE_LABEL)
+       {
+ 	edge e;
+ 	for (e = bb->pred; e; e = e->pred_next)
+ 	  if (EDGE_FREQUENCY (e) && e->src->index > 0
+ 	      && !(e->flags & EDGE_FALLTHRU))
+ 	    insert = 1;
+       }
+     if (!insert)
+       {
+ 	prev = prev_real_insn (ret);
+ 	if (prev && GET_CODE (prev) == JUMP_INSN
+ 	    && any_condjump_p (prev))
+ 	  insert = 1;
+       }
+     if (insert)
+       emit_insn_before (gen_nop (), ret);
+   }
+ }
+ 
  #include "gt-i386.h"



More information about the Gcc-patches mailing list