This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Re: Performance of Integer Multiplication on PIII


> On Sat, 3 Nov 2001, Jan Hubicka wrote:
> 
> > > I am confused myself as to which architecture is meant by -march=i686,
> > > but I didn't think it was P-III.  I thought -march=pentiumpro was the one
> > > for P-II/P-III.  However, wouldn't many people wonder whether you mean P4
> > > when you talk about "faster Pentiums?" Some of us would not go out of our
> > > way to slow down the P4 by asking for imul on code meant to run on a
> > > variety of CPU's, although some would like to have -Os favor the use of
> > > imul.  It might be interesting to have a separate table of the penalties
> > > used in each version of gcc for each architecture selection, rather than
> > > having to peruse source.
> > Gcc do have set of costs for each supported x86 variant and for size
> > optimization separately.  The model is an estimate (it does not handle leas and
> > 2 address machines perfectly), as the algorithm to construct sequences is not
> > trying all possibilities, so it is possible that the produced sequences are
> > suboptimal, but in case it makes truly big differences, we probably should
> > investigate it.
> 
> Yes I agree and this is one of the cases.

OK, I did so and found following problems:
1) RTX_COST do use lea cost when lea is possible assuming that lea
   is cheaper, that is not the case for Athlon
2) PPro/P4 decompose lea to primitive operations, so for RTX_COST purposes
   we can just hide her existence.
   (for instance lea doing shift+multiply together does not take one cycle,
    but two).
3) expand_mult precompute shift/add costs for register operand, but can
   use memory operand for multiply cost resulting in comparing apples to
   oranges.

The attached patch should fix all three problems.  Your testcase still
does use some unwound multiplies, but runs faster on celeron machines here
in lab than the assembly one you supplied.

Assembly now is:

.globl read
	.type	read,@function
read:
	subl	$8, %esp
	movl	%ebx, (%esp)
	movl	12(%esp), %ebx
	movl	%esi, 4(%esp)
	movl	20(%ebx), %eax
	movl	24(%ebx), %edx
	movl	12(%ebx), %esi
	imull	$14406, %eax, %eax
	imull	$86436, %edx, %edx
	imull	$343, %esi, %esi
	movl	4(%ebx), %ecx
	addl	%edx, %eax
	movl	16(%ebx), %edx
	imull	$2401, %edx, %edx
	addl	%edx, %esi
	leal	0(,%ecx,8), %edx
	subl	%ecx, %edx
	imull	$49, 8(%ebx), %ecx
	addl	%ecx, %edx
	movl	(%ebx), %ecx
	addl	%ecx, %edx
	addl	%edx, %esi
	movl	28(%ebx), %edx
	addl	%esi, %eax
	movl	(%esp), %ebx
	imull	$518616, %edx, %edx
	movl	4(%esp), %esi
	addl	$8, %esp
	addl	%edx, %eax
	ret

Mon Nov  5 15:42:22 CET 2001  Jan Hubicka  <jh@suse.cz>

	* expmed.c (expand_mult): Force operand to register before computing
	cost.
	* i386.c (x86_decompose_lea): New global vairable.
	* i386.h (x86_decompose_lea): Declare.
	(TARGET_DECOMPOSE_LEA): New macro.
	(RTX_COST): Handle leas properly.

Index: expmed.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/expmed.c,v
retrieving revision 1.93
diff -c -3 -p -r1.93 expmed.c
*** expmed.c	2001/11/02 22:07:29	1.93
--- expmed.c	2001/11/05 14:36:38
*************** expand_mult (mode, op0, op1, target, uns
*** 2406,2411 ****
--- 2406,2415 ----
        int mult_cost;
        enum {basic_variant, negate_variant, add_variant} variant = basic_variant;
  
+       /* op0 must be register to make mult_cost match the precomputed shiftadd_cost
+          array.  */
+       op0 = force_reg (mode, op0);
+ 
        /* Try to do the computation three ways: multiply by the negative of OP1
  	 and then negate, do the multiplication directly, or do multiplication
  	 by OP1 - 1.  */
Index: config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.c,v
retrieving revision 1.330
diff -c -3 -p -r1.330 i386.c
*** i386.c	2001/10/31 14:32:27	1.330
--- i386.c	2001/11/05 14:38:46
*************** const int x86_memory_mismatch_stall = m_
*** 375,380 ****
--- 375,381 ----
  const int x86_accumulate_outgoing_args = m_ATHLON | m_PENT4 | m_PPRO;
  const int x86_prologue_using_move = m_ATHLON | m_PENT4 | m_PPRO;
  const int x86_epilogue_using_move = m_ATHLON | m_PENT4 | m_PPRO;
+ const int x86_decompose_lea = m_PPRO | m_PENT4;
  
  /* In case the avreage insn count for single function invocation is
     lower than this constant, emit fast (but longer) prologue and
Index: config/i386/i386.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.h,v
retrieving revision 1.214
diff -c -3 -p -r1.214 i386.h
*** i386.h	2001/10/31 14:32:28	1.214
--- i386.h	2001/11/05 14:39:20
*************** extern const int x86_promote_hi_regs, x8
*** 218,224 ****
  extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
  extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
  extern const int x86_accumulate_outgoing_args, x86_prologue_using_move;
! extern const int x86_epilogue_using_move;
  
  #define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
  #define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
--- 218,224 ----
  extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
  extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
  extern const int x86_accumulate_outgoing_args, x86_prologue_using_move;
! extern const int x86_epilogue_using_move, x86_decompose_lea;
  
  #define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
  #define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
*************** extern const int x86_epilogue_using_move
*** 256,261 ****
--- 256,262 ----
  #define TARGET_MEMORY_MISMATCH_STALL (x86_memory_mismatch_stall & CPUMASK)
  #define TARGET_PROLOGUE_USING_MOVE (x86_prologue_using_move & CPUMASK)
  #define TARGET_EPILOGUE_USING_MOVE (x86_epilogue_using_move & CPUMASK)
+ #define TARGET_DECOMPOSE_LEA (x86_decompose_lea & CPUMASK)
  
  #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
  
*************** while (0)
*** 2485,2491 ****
  	HOST_WIDE_INT value = INTVAL (XEXP (X, 1));			\
  	if (value == 1)							\
  	  TOPLEVEL_COSTS_N_INSNS (ix86_cost->add);			\
! 	if (value == 2 || value == 3)					\
  	  TOPLEVEL_COSTS_N_INSNS (ix86_cost->lea);			\
        }									\
      /* fall through */							\
--- 2486,2494 ----
  	HOST_WIDE_INT value = INTVAL (XEXP (X, 1));			\
  	if (value == 1)							\
  	  TOPLEVEL_COSTS_N_INSNS (ix86_cost->add);			\
! 	if ((value == 2 || value == 3)					\
! 	    && !TARGET_DECOMPOSE_LEA					\
! 	    && ix86_cost->lea <= ix86_cost->shift_const)		\
  	  TOPLEVEL_COSTS_N_INSNS (ix86_cost->lea);			\
        }									\
      /* fall through */							\
*************** while (0)
*** 2546,2582 ****
      TOPLEVEL_COSTS_N_INSNS (ix86_cost->divide);				\
  									\
    case PLUS:								\
!     if (GET_CODE (XEXP (X, 0)) == PLUS					\
! 	&& GET_CODE (XEXP (XEXP (X, 0), 0)) == MULT			\
! 	&& GET_CODE (XEXP (XEXP (XEXP (X, 0), 0), 1)) == CONST_INT	\
! 	&& GET_CODE (XEXP (X, 1)) == CONST_INT)				\
        {									\
! 	HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (X, 0), 0), 1));	\
! 	if (val == 2 || val == 4 || val == 8)				\
  	  {								\
!             return (COSTS_N_INSNS (ix86_cost->lea)			\
! 		    + rtx_cost (XEXP (XEXP (X, 0), 1), OUTER_CODE)	\
! 		    + rtx_cost (XEXP (XEXP (XEXP (X, 0), 0), 0), OUTER_CODE) \
! 		    + rtx_cost (XEXP (X, 1), OUTER_CODE));		\
  	  }								\
!       }									\
!     else if (GET_CODE (XEXP (X, 0)) == MULT				\
! 	     && GET_CODE (XEXP (XEXP (X, 0), 1)) == CONST_INT)		\
!       {									\
! 	HOST_WIDE_INT val = INTVAL (XEXP (XEXP (X, 0), 1));		\
! 	if (val == 2 || val == 4 || val == 8)				\
  	  {								\
  	    return (COSTS_N_INSNS (ix86_cost->lea)			\
  		    + rtx_cost (XEXP (XEXP (X, 0), 0), OUTER_CODE)	\
  		    + rtx_cost (XEXP (X, 1), OUTER_CODE));		\
  	  }								\
-       }									\
-     else if (GET_CODE (XEXP (X, 0)) == PLUS)				\
-       {									\
- 	return (COSTS_N_INSNS (ix86_cost->lea)				\
- 		+ rtx_cost (XEXP (XEXP (X, 0), 0), OUTER_CODE)		\
- 		+ rtx_cost (XEXP (XEXP (X, 0), 1), OUTER_CODE)		\
- 		+ rtx_cost (XEXP (X, 1), OUTER_CODE));			\
        }									\
  									\
      /* fall through */							\
--- 2549,2590 ----
      TOPLEVEL_COSTS_N_INSNS (ix86_cost->divide);				\
  									\
    case PLUS:								\
!     if (!TARGET_DECOMPOSE_LEA						\
! 	&& INTEGRAL_MODE_P (GET_MODE (X))				\
! 	&& GET_MODE_BITSIZE (GET_MODE (X)) <= GET_MODE_BITSIZE (Pmode))	\
        {									\
!         if (GET_CODE (XEXP (X, 0)) == PLUS				\
! 	    && GET_CODE (XEXP (XEXP (X, 0), 0)) == MULT			\
! 	    && GET_CODE (XEXP (XEXP (XEXP (X, 0), 0), 1)) == CONST_INT	\
! 	    && CONSTANT_P (XEXP (X, 1)))				\
  	  {								\
! 	    HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (X, 0), 0), 1));\
! 	    if (val == 2 || val == 4 || val == 8)			\
! 	      {								\
! 		return (COSTS_N_INSNS (ix86_cost->lea)			\
! 			+ rtx_cost (XEXP (XEXP (X, 0), 1), OUTER_CODE)	\
! 			+ rtx_cost (XEXP (XEXP (XEXP (X, 0), 0), 0), OUTER_CODE) \
! 			+ rtx_cost (XEXP (X, 1), OUTER_CODE));		\
! 	      }								\
  	  }								\
! 	else if (GET_CODE (XEXP (X, 0)) == MULT				\
! 		 && GET_CODE (XEXP (XEXP (X, 0), 1)) == CONST_INT)	\
  	  {								\
+ 	    HOST_WIDE_INT val = INTVAL (XEXP (XEXP (X, 0), 1));		\
+ 	    if (val == 2 || val == 4 || val == 8)			\
+ 	      {								\
+ 		return (COSTS_N_INSNS (ix86_cost->lea)			\
+ 			+ rtx_cost (XEXP (XEXP (X, 0), 0), OUTER_CODE)	\
+ 			+ rtx_cost (XEXP (X, 1), OUTER_CODE));		\
+ 	      }								\
+ 	  }								\
+ 	else if (GET_CODE (XEXP (X, 0)) == PLUS)			\
+ 	  {								\
  	    return (COSTS_N_INSNS (ix86_cost->lea)			\
  		    + rtx_cost (XEXP (XEXP (X, 0), 0), OUTER_CODE)	\
+ 		    + rtx_cost (XEXP (XEXP (X, 0), 1), OUTER_CODE)	\
  		    + rtx_cost (XEXP (X, 1), OUTER_CODE));		\
  	  }								\
        }									\
  									\
      /* fall through */							\


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]