i386 ashlsi3 improvements
Jeffrey A Law
law@hurl.cygnus.com
Mon Mar 8 23:33:00 GMT 1999
These are some minor improvements to the ashlsi3 support for the Pentium
and PPro processors. These changes were discussed on egcs-patches back in
Feb. I've updated them to account for some of the feedback I've received.
Until reg-stack is fixed, we can't determine when lea is profitable on the
Pentium from an issue standpoint, so that particular optimization is not
yet enabled.
* i386.md (ashlsi3): Revise comments. Provide new anonymous
pattern for Pentium and PPro/PII. Reverse constraints in
generic ashlsi3 anonymous pattern.
Index: i386.md
===================================================================
RCS file: /egcs/carton/cvsfiles/egcs/gcc/config/i386/i386.md,v
retrieving revision 1.50
diff -c -3 -p -r1.50 i386.md
*** i386.md 1999/02/23 10:22:57 1.50
--- i386.md 1999/03/09 07:30:25
*************** byte_xor_operation:
*** 4709,4718 ****
RET;
}")
- ;; On i386 and i486, "addl reg,reg" is faster than "sall $1,reg"
- ;; On i486, movl/sall appears slightly faster than leal, but the leal
- ;; is smaller - use leal for now unless the shift count is 1.
-
(define_expand "ashlsi3"
[(set (match_operand:SI 0 "nonimmediate_operand" "")
(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "")
--- 4709,4714 ----
*************** byte_xor_operation:
*** 4720,4738 ****
""
"")
! ;; For register destinations:
! ;; add == 2 bytes, move == 2 bytes, shift == 3 bytes, lea == 7 bytes
;;
! ;; lea loses when optimizing for size
;;
! ;; Do the math. If the count is 1, using add, else using sal will
! ;; produce the smallest possible code, even when the source and
! ;; dest do not match. For a memory destination, sal is the only
! ;; choice.
! ;;
! ;; Do not try to handle case where src and dest do not match. Let regmove
! ;; and reload handle them. A mov followed by this insn will generate the
! ;; desired size optimized results.
(define_insn ""
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm")
(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0")
--- 4716,4735 ----
""
"")
! ;; Optimizing for code size:
! ;; For register destinations:
! ;; add == 2 bytes, move == 2 bytes, shift == 3 bytes, lea == 7 bytes
! ;;
! ;; lea loses when optimizing for size
;;
! ;; Do the math. If the count is 1, using add, else using sal will
! ;; produce the smallest possible code, even when the source and
! ;; dest do not match. For a memory destination, sal is the only
! ;; choice.
;;
! ;; Do not try to handle case where src and dest do not match. Let regmove
! ;; and reload handle them. A mov followed by this insn will generate the
! ;; desired size optimized results.
(define_insn ""
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm")
(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0")
*************** byte_xor_operation:
*** 4748,4758 ****
return AS2 (sal%L0,%2,%0);
}")
(define_insn ""
! [(set (match_operand:SI 0 "nonimmediate_operand" "=r,rm")
! (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "r,0")
! (match_operand:SI 2 "nonmemory_operand" "M,cI")))]
! "! optimize_size"
"*
{
if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))
--- 4745,4888 ----
return AS2 (sal%L0,%2,%0);
}")
+ ;; For Pentium/Pentium MMX:
+ ;;
+ ;; We want to optimize for pairability, but avoid generating AGI stalls.
+ ;;
+ ;; If this insn is expected to issue in the U pipe, then prefer sal,
+ ;; else prefer lea for small shifts when srcreg == dstreg.
+ ;;
+ ;; For PPro/PII
+ ;;
+ ;; There's more than one approach to optimizing for this family; it is
+ ;; unclear which approach is best. For now, we will try to minimize
+ ;; uops. Note that sal and lea have the same characteristics, so we
+ ;; prefer sal as it takes less space.
+ ;;
+ ;; We can actually share code for these two cases since the basic techniques
+ ;; for generating good code on these chips are the same, even if the final
+ ;; code sequences are different.
+ ;;
+ ;; I do not know what is most appropriate for the AMD or Cyrix chips.
+ ;;
+ ;; srcreg == dstreg, constant shift count:
+ ;;
+ ;; For a shift count of one, use "add".
+ ;; For a shift count of two or three, use "sal"/"lea" for Pentium and
+ ;; Pentium MMX depending on which pipe the insn will execute.
+ ;; All others use "sal".
+ ;;
+ ;; srcreg != dstreg, constant shift count:
+ ;;
+ ;; For shift counts of one to three, use "lea".
+ ;; All others use "lea" for the first three bits of the shift into the
+ ;; destination reg, then fall back on the srcreg == dstreg case for the residual shift.
+ ;;
+ ;; memory destinations or nonconstant shift count:
+ ;;
+ ;; Use "sal".
+ ;;
+ (define_insn ""
+ [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
+ (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r")
+ (match_operand:SI 2 "nonmemory_operand" "cI,I")))]
+ "! optimize_size
+ && ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
+ || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)"
+ "*
+ {
+ /* This should be extremely rare (impossible?). We cannot encode a shift
+ of the stack pointer using an lea instruction, so copy the stack pointer
+ into the destination register and fall into the srcreg == dstreg shifting
+ support further down. */
+ if (operands[1] == stack_pointer_rtx)
+ {
+ output_asm_insn (AS2 (mov%L0,%1,%0), operands);
+ operands[1] = operands[0];
+ }
+
+ /* Handle the case where the destination is a register and srcreg != dstreg. */
+ if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))
+ {
+ /* For counts > 3, it is easiest to split into component insns. */
+ if (INTVAL (operands[2]) > 3)
+ return \"#\";
+
+ /* For shifts of up to 3 bits use lea, with the shift written as a multiply by 1 << count. */
+ operands[1] = gen_rtx_MULT (SImode, operands[1],
+ GEN_INT (1 << INTVAL (operands[2])));
+ return AS2 (lea%L0,%a1,%0);
+ }
+
+ /* From here on, source and destination match. */
+
+ /* Handle a variable shift count held in a register. */
+ if (REG_P (operands[2]))
+ return AS2 (sal%L0,%b2,%0);
+
+ /* Always perform a shift of a register by 1 using an add instruction. */
+ if (REG_P (operands[0]) && operands[2] == const1_rtx)
+ return AS2 (add%L0,%0,%0);
+
+ #if 0
+ /* ??? Currently disabled. reg-stack currently stomps on the mode of
+ each insn. Thus, we cannot easily detect when we should use lea to
+ improve issue characteristics. Until reg-stack is fixed, fall back to
+ the sal instruction for Pentiums to avoid an AGI stall. */
+ /* To shift a reg by 2 or 3, use an lea instruction for Pentium if this
+ insn is expected to issue into the V pipe (the insn's mode will be
+ TImode for a U pipe, and !TImode for a V pipe instruction). */
+ if (REG_P (operands[0])
+ && GET_CODE (operands[2]) == CONST_INT
+ && INTVAL (operands[2]) <= 3
+ && (int)ix86_cpu == (int)PROCESSOR_PENTIUM
+ && GET_MODE (insn) != TImode)
+ {
+ operands[1] = gen_rtx_MULT (SImode, operands[1],
+ GEN_INT (1 << INTVAL (operands[2])));
+ return AS2 (lea%L0,%a1,%0);
+ }
+ #endif
+
+ /* Otherwise (memory destination, or any remaining constant count) use a shift instruction. */
+ return AS2 (sal%L0,%2,%0);
+ }")
+
+ ;; Pentium/PPro/PII splitter used when srcreg != destreg and the shift
+ ;; count is > 3. We use lea to shift by three bits into the destination
+ ;; register, then fall back to the normal srcreg == dstreg shifting code
+ ;; for the residual count (the original count minus three).
+ (define_split
+ [(set (match_operand:SI 0 "register_operand" "=r")
+ (ashift:SI (match_operand:SI 1 "register_operand" "r")
+ (match_operand:SI 2 "immediate_operand" "I")))]
+ "reload_completed
+ && ! optimize_size
+ && ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
+ || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)
+ && GET_CODE (operands[2]) == CONST_INT
+ && INTVAL (operands[2]) > 3
+ && true_regnum (operands[0]) != true_regnum (operands[1])"
+ [(set (match_dup 0) (ashift:SI (match_dup 1) (match_dup 2)))
+ (set (match_dup 0) (ashift:SI (match_dup 0) (match_dup 3)))]
+ "
+ {
+ operands[3] = GEN_INT (INTVAL (operands[2]) - 3); /* Residual count; must be computed before operands[2] is overwritten. */
+ operands[2] = GEN_INT (3); /* First insn shifts by three (the lea case). */
+ }")
+
+
+ ;; On i386 and i486, "addl reg,reg" is faster than "sall $1,reg"
+ ;; On i486, movl/sall appears slightly faster than leal, but the leal
+ ;; is smaller - use leal for now unless the shift count is 1.
+ ;;
(define_insn ""
! [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
! (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r")
! (match_operand:SI 2 "nonmemory_operand" "cI,M")))]
! "! optimize_size
! && ! ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
! || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)"
"*
{
if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))
More information about the Gcc-patches
mailing list