This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
i386 ashlsi3 improvements

To: egcs-patches at egcs dot cygnus dot com
Subject: i386 ashlsi3 improvements
From: Jeffrey A Law <law at hurl dot cygnus dot com>
Date: Tue, 09 Mar 1999 00:33:36 -0700
Reply-To: law at cygnus dot com

These are some minor improvements to the ashlsi3 support for the Pentium
and PPro processors.  These changes were discussed on egcs-patches back in
Feb.  I've updated them to account for some of the feedback I've received.

Until reg-stack is fixed, we can't determine when lea is profitable on the
Pentium from an issue standpoint, so that particular optimization is not
yet enabled.

        * i386.md (ashlsi3): Revise comments.  Provide new anonymous
        pattern for Pentium and PPro/PII.  Reverse constraints in 
        generic ashlsi3 anonymous pattern.


Index: i386.md
===================================================================
RCS file: /egcs/carton/cvsfiles/egcs/gcc/config/i386/i386.md,v
retrieving revision 1.50
diff -c -3 -p -r1.50 i386.md
*** i386.md	1999/02/23 10:22:57	1.50
--- i386.md	1999/03/09 07:30:25
*************** byte_xor_operation:
*** 4709,4718 ****
    RET;
  }")
  
- ;; On i386 and i486, "addl reg,reg" is faster than "sall $1,reg"
- ;; On i486, movl/sall appears slightly faster than leal, but the leal
- ;; is smaller - use leal for now unless the shift count is 1.
- 
  (define_expand "ashlsi3"
    [(set (match_operand:SI 0 "nonimmediate_operand" "")
  	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "")
--- 4709,4714 ----
*************** byte_xor_operation:
*** 4720,4738 ****
    ""
    "")
  
! ;; For register destinations:
! ;;   add == 2 bytes, move == 2 bytes, shift == 3 bytes, lea == 7 bytes
  ;;
! ;;   lea loses when optimizing for size
  ;;
! ;; Do the math.  If the count is 1, using add, else using sal will
! ;; produce the smallest possible code, even when the source and
! ;; dest do not match.  For a memory destination, sal is the only
! ;; choice.
! ;;
! ;; Do not try to handle case where src and dest do not match.  Let regmove
! ;; and reload handle them.  A mov followed by this insn will generate the
! ;; desired size optimized results.
  (define_insn ""
    [(set (match_operand:SI 0 "nonimmediate_operand" "=rm")
  	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0")
--- 4716,4735 ----
    ""
    "")
  
! ;; Optimizing for code size:
! ;;   For register destinations:
! ;;     add == 2 bytes, move == 2 bytes, shift == 3 bytes, lea == 7 bytes
! ;;
! ;;     lea loses when optimizing for size
  ;;
! ;;   Do the math.  If the count is 1, using add, else using sal will
! ;;   produce the smallest possible code, even when the source and
! ;;   dest do not match.  For a memory destination, sal is the only
! ;;   choice.
  ;;
! ;;   Do not try to handle case where src and dest do not match.  Let regmove
! ;;   and reload handle them.  A mov followed by this insn will generate the
! ;;   desired size optimized results.
  (define_insn ""
    [(set (match_operand:SI 0 "nonimmediate_operand" "=rm")
  	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0")
*************** byte_xor_operation:
*** 4748,4758 ****
    return AS2 (sal%L0,%2,%0);
  }")
  
  (define_insn ""
!   [(set (match_operand:SI 0 "nonimmediate_operand" "=r,rm")
! 	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "r,0")
! 		   (match_operand:SI 2 "nonmemory_operand" "M,cI")))]
!   "! optimize_size"
    "*
  {
    if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))
--- 4745,4888 ----
    return AS2 (sal%L0,%2,%0);
  }")
  
+ ;; For Pentium/Pentium MMX:
+ ;;
+ ;;   We want to optimize for pairability, but avoid generating AGI stalls.
+ ;;
+ ;;   If this insn is expected to issue in the U pipe, then prefer sal,
+ ;;   else prefer lea for small shifts when srcreg == dstreg.
+ ;;
+ ;; For PPro/PII
+ ;;
+ ;;   There's more than one approach to optimizing for this family; it is
+ ;;   unclear which approach is best.  For now, we will try to minimize
+ ;;   uops.  Note that sal and lea have the same characteristics, so we
+ ;;   prefer sal as it takes less space.
+ ;;
+ ;; We can actually share code for these two cases since the basic techniques
+ ;; for generating good code on these chips are the same, even if the final
+ ;; code sequences are different.
+ ;;
+ ;; I do not know what is most appropriate for the AMD or Cyrix chips.
+ ;;
+ ;;   srcreg == dstreg, constant shift count:
+ ;;
+ ;;     For a shift count of one, use "add".
+ ;;     For a shift count of two or three, use "sal"/"lea" for Pentium and
+ ;;     Pentium MMX depending on which pipe the insn will execute.
+ ;;     All others use "sal".
+ ;;
+ ;;   srcreg != dstreg, constant shift count:
+ ;;
+ ;;     For shift counts of one to three, use "lea".
+ ;;     All others use "lea" for the first shift into the destination reg,
+ ;;     then fall back on the srcreg == dstreg for the residual shifts.
+ ;;
+ ;;   memory destinations or nonconstant shift count:
+ ;;
+ ;;     Use "sal".
+ ;;
+ (define_insn ""
+   [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
+ 	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r")
+ 		   (match_operand:SI 2 "nonmemory_operand" "cI,I")))]
+   "! optimize_size
+    && ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
+        || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)"
+   "*
+ {
+   /* This should be extremely rare (impossible?).  We can not encode a shift
+      of the stack pointer using an lea instruction.  So copy the stack pointer
+      into the destination register and fall into the srcreg == dstreg shifting
+      support (operands[1] is redirected to operands[0] after the copy).  */
+   if (operands[1] == stack_pointer_rtx)
+     {
+       output_asm_insn (AS2 (mov%L0,%1,%0), operands);
+       operands[1] = operands[0];
+     }
+ 
+   /* Handle case where srcreg != dstreg.  */
+   if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))
+     {
+       /* For counts > 3, return # and leave the work to the splitter below.  */
+       if (INTVAL (operands[2]) > 3)
+ 	return \"#\";
+     
+       /* For shifts up to and including 3 bits, use lea on src * (1 << count).  */
+       operands[1] = gen_rtx_MULT (SImode, operands[1],
+ 				  GEN_INT (1 << INTVAL (operands[2])));
+       return AS2 (lea%L0,%a1,%0);
+     }
+ 
+   /* Source and destination match.  */
+ 
+   /* Handle variable shift; the count register is printed via %b2.  */
+   if (REG_P (operands[2]))
+     return AS2 (sal%L0,%b2,%0);
+ 
+   /* Always perform a register shift by 1 using an add instruction.  */
+   if (REG_P (operands[0]) && operands[2] == const1_rtx)
+     return AS2 (add%L0,%0,%0);
+ 
+ #if 0
+   /* ??? Currently disabled.  reg-stack currently stomps on the mode of
+      each insn.  Thus, we can not easily detect when we should use lea to
+      improve issue characteristics.  Until reg-stack is fixed, fall back to
+      sal instruction for Pentiums to avoid AGI stall.  */
+   /* To shift a reg by 2 or 3, use an lea instruction for Pentium if this
+      insn is expected to issue into the V pipe (the insn's mode will be
+      TImode for a U pipe, and !TImode for a V pipe instruction).  */
+   if (REG_P (operands[0])
+       && GET_CODE (operands[2]) == CONST_INT
+       && INTVAL (operands[2]) <= 3
+       && (int)ix86_cpu == (int)PROCESSOR_PENTIUM
+       && GET_MODE (insn) != TImode)
+     {
+       operands[1] = gen_rtx_MULT (SImode, operands[1],
+ 				  GEN_INT (1 << INTVAL (operands[2])));
+       return AS2 (lea%L0,%a1,%0);
+     }
+ #endif
+ 
+   /* Otherwise use a shift instruction.  */
+   return AS2 (sal%L0,%2,%0);
+ }")
+ 
+ ;; Pentium/PPro/PII splitter used when srcreg != destreg and the constant
+ ;; shift count is > 3.  The first insn shifts the source left by 3 into the
+ ;; destination register (lea); the second shifts the destination in place by
+ ;; the residual count, computed from operands[2] before it is overwritten.
+ (define_split
+   [(set (match_operand:SI 0 "register_operand" "=r")
+ 	(ashift:SI (match_operand:SI 1 "register_operand" "r")
+ 		   (match_operand:SI 2 "immediate_operand" "I")))]
+   "reload_completed
+    && ! optimize_size
+    && ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
+        || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)
+    && GET_CODE (operands[2]) == CONST_INT
+    && INTVAL (operands[2]) > 3
+    && true_regnum (operands[0]) != true_regnum (operands[1])"
+   [(set (match_dup 0) (ashift:SI (match_dup 1) (match_dup 2)))
+    (set (match_dup 0) (ashift:SI (match_dup 0) (match_dup 3)))]
+   "
+ {
+   operands[3] = GEN_INT (INTVAL (operands[2]) - 3);
+   operands[2] = GEN_INT (3);
+ }")
+ 
+ 
+ ;; On i386 and i486, "addl reg,reg" is faster than "sall $1,reg"
+ ;; On i486, movl/sall appears slightly faster than leal, but the leal
+ ;; is smaller - use leal for now unless the shift count is 1.
+ ;;
  (define_insn ""
!   [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
! 	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r")
! 		   (match_operand:SI 2 "nonmemory_operand" "cI,M")))]
!   "! optimize_size
!    && ! ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
!          || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)"
    "*
  {
    if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]