This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Instruction alignment for amd-k6 and new_ia32_branch


Hi
The AMD-K6 optimization manual recommends aligning FP instructions to avoid
straddling the cache line boundary. When an instruction has its first
byte on one cache line and its second byte on the next, there is not enough
predecode information and the instruction becomes vector decoded (which incurs
a large multicycle penalty).
I've experimented with this by emitting .p2align 5,,1 before each FP instruction
and, like most optimizations recommended by the K6 manual, it had a neutral
effect, because the FP unit is slow and thus decoding penalties are not important.

Interestingly enough, this turned out to be a very important optimization
for integer code.  Quite common instructions such as movsx have a two-byte
opcode, and this alignment helps to make them safe.  With it I was able to enable
TARGET_MOVX for K6 and get large speedups in graphics loops etc. due to reduced
dependencies (previously it was a hit-or-miss switch).
This is one of the most important K6 optimizations I've implemented.

This patch adds a new target macro INSN_ALIGNMENT that can be used to set
the alignment for each individual instruction.  Final.c then outputs the
necessary code alignment.  I've chosen this solution because it should make it
possible for shorten_branches to take this value into account.  For now I am
just outputting the alignment in the final pass.

Let me know your ideas.

Honza

Sun Jul 11 18:32:06 EDT 1999  Jan Hubicka  <hubicka@freesoft.cz>
	* i386.c (ix86_movx): Set for K6.
	(ix86_insn_alignment): New function.
	* i386.h (ix86_insn_alignment): Declare.
	(INSN_ALIGNMENT): New macro.
	* final.c (final_scan_insn): Output alignment when INSN_ALIGNMENT
	is defined.

*** /root/i386/i386.c	Sat Jul 10 11:07:24 1999
--- config/i386/i386.c	Sat Jul 10 20:36:48 1999
*************** struct processor_costs *ix86_cost = &pen
*** 120,126 ****
  const int x86_use_leave = m_386 | m_K6;
  const int x86_push_memory = m_386 | m_K6;
  const int x86_zero_extend_with_and = m_486 | m_PENT;
! const int x86_movx = 0 /* m_386 | m_PPRO | m_K6 */;
  const int x86_double_with_add = ~m_386;
  const int x86_use_bit_test = m_386;
  const int x86_unroll_strlen = m_486 | m_PENT;
--- 120,126 ----
  const int x86_use_leave = m_386 | m_K6;
  const int x86_push_memory = m_386 | m_K6;
  const int x86_zero_extend_with_and = m_486 | m_PENT;
! const int x86_movx = m_K6 /* m_386 | m_PPRO | m_K6 */;
  const int x86_double_with_add = ~m_386;
  const int x86_use_bit_test = m_386;
  const int x86_unroll_strlen = m_486 | m_PENT;
*************** legitimize_address (x, oldx, mode)
*** 2242,2247 ****
--- 2318,2376 ----
    return x;
  }
  
+ /* Return the alignment of INSN.  This function is called by the final pass
+    before every insn is output; when LOG != 0 an alignment pseudo is printed.
+   
+    The AMD-K6 has large penalties when predecode information (the part of the
+    opcode necessary to determine instruction length) straddles a cache line.
+    So we output the necessary alignment before each instruction with an opcode
+    longer than two bytes (usually the last byte of the opcode is not important
+    except for the parameter-less instructions, which are infrequent).  */
+ void
+ ix86_insn_alignment (insn, log, max)
+      rtx insn;
+      int *log;
+      int *max;
+ {
+   *log = 0;
+   *max = 0;
+   if (!TARGET_K6)
+     return;
+   if (recog_memoized (insn) < 0)
+     return;
+ #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
+   switch (get_attr_type (insn))
+     {
+     case TYPE_FMOV:
+     case TYPE_FOP:
+     case TYPE_FCMP:
+     case TYPE_FOP1:
+     case TYPE_FMUL:
+     case TYPE_FDIV:
+     case TYPE_FPSPC:
+     case TYPE_FCMOV:
+     case TYPE_FXCH:
+       /* Make sure that fp insns do not straddle across the cache line.  */
+       *log = 5;
+       *max = 1;
+       break;
+     default:
+       /* Short decodable insns become vector decodable when predecode
+          information straddles across the cache line.  */
+       if (get_attr_ppro_uops (insn) != PPRO_UOPS_MANY2
+ 	/* Sadly we can't do good job for prefixed instructions, because the
+ 	   alignment ought to lie exactly between prefix and instruction.  */
+ 	  && !get_attr_length_prefix (insn)
+ 	  && get_attr_length_opcode (insn) > 2)
+ 	{
+ 	  *log = 5;
+ 	  *max = 1;
+ 	}
+       break;
+     }
+ #endif
+ }
+ 
  /* Print an integer constant expression in assembler syntax.  Addition
     and subtraction are the only arithmetic that may appear in these
     expressions.  FILE is the stdio stream to write to, X is the rtx, and
*** /root/i386/i386.h	Sat Jul 10 11:07:24 1999
--- config/i386/i386.h	Sat Jul 10 20:36:48 1999
*************** do { long l;						\
*** 2361,2366 ****
--- 2373,2380 ----
  #define RET return ""
  #define AT_SP(mode) (gen_rtx_MEM ((mode), stack_pointer_rtx))
  
+ #define INSN_ALIGNMENT(insn, log, max) ix86_insn_alignment (insn, log, max)
+ 
  /* Define the codes that are matched by predicates in i386.c.  */
  
  #define PREDICATE_CODES							\
*************** extern int ix86_attr_length_default XPAR
*** 2485,2492 ****
  extern int ix86_issue_rate XPARAMS((void));
  extern int ix86_adjust_cost XPARAMS((xrtx, xrtx, xrtx, int));
  extern void ix86_sched_init XPARAMS((FILE *, int));
  extern void ix86_sched_reorder XPARAMS((FILE *, int, xrtx *, int));
  extern int ix86_variable_issue XPARAMS((FILE *, int, xrtx, int));
  
  
  #undef XPARAMS
--- 2501,2511 ----
  extern int ix86_issue_rate XPARAMS((void));
  extern int ix86_adjust_cost XPARAMS((xrtx, xrtx, xrtx, int));
  extern void ix86_sched_init XPARAMS((FILE *, int));
  extern void ix86_sched_reorder XPARAMS((FILE *, int, xrtx *, int));
  extern int ix86_variable_issue XPARAMS((FILE *, int, xrtx, int));
+ 
+ extern void ix86_insn_alignment XPARAMS((rtx, int *, int *));
+ extern int ix86_adjust_align XPARAMS((rtx, int));
  
  
  #undef XPARAMS
*** /root/i386/final.c	Sun Jul 11 18:29:16 1999
--- final.c	Sat Jul 10 11:25:51 1999
*************** final_scan_insn (insn, file, optimize, p
*** 2812,2817 ****
--- 2812,2833 ----
  
  #endif
  
+ 	/* Output optional alignment needed for insn. */
+ #ifdef INSN_ALIGNMENT
+ 	{
+ 	  int align, max_skip;
+ 	  INSN_ALIGNMENT (insn, &align, &max_skip);
+ 	  if (align)
+ 	    {
+ #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
+ 	      ASM_OUTPUT_MAX_SKIP_ALIGN (file, align, max_skip);
+ #else
+ 	      ASM_OUTPUT_ALIGN (file, align, max_skip);
+ #endif
+ 	    }
+ 	}
+ #endif
+ 
  #ifdef HAVE_peephole
  	/* Do machine-specific peephole optimizations if desired.  */
  


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]