This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Re: Performance of Integer Multiplication on PIII
> On Sat, 3 Nov 2001, Jan Hubicka wrote:
>
> > > I am confused myself as to which architecture is meant by -march=i686,
> > > but I didn't think it was P-III. I thought -march=pentiumpro was the one
> > > for P-II/P-III. However, wouldn't many people wonder whether you mean P4
> > > when you talk about "faster Pentiums?" Some of us would not go out of our
> > > way to slow down the P4 by asking for imul on code meant to run on a
> > > variety of CPU's, although some would like to have -Os favor the use of
> > > imul. It might be interesting to have a separate table of the penalties
> > > used in each version of gcc for each architecture selection, rather than
> > > having to peruse source.
> > GCC does have a set of costs for each supported x86 variant and, separately,
> > for size optimization. The model is an estimate (it does not handle leas and
> > 2-address machines perfectly); since the algorithm that constructs sequences does
> > not try all possibilities, it is possible that the produced sequences are
> > suboptimal, but in cases where it makes a truly big difference, we probably
> > should investigate it.
>
> Yes I agree and this is one of the cases.
OK, I did so and found the following problems:
1) RTX_COST does use the lea cost when lea is possible, assuming that lea
is cheaper; that is not the case for the Athlon.
2) PPro/P4 decompose lea into primitive operations, so for RTX_COST purposes
we can just hide its existence.
(for instance, a lea doing a shift and an add together does not take one cycle,
but two).
3) expand_mult precomputes shift/add costs for a register operand, but can
use a memory operand for the multiply cost, resulting in comparing apples to
oranges.
The attached patch should fix all three problems. Your testcase still
uses some unwound multiplies, but runs faster on the Celeron machines here
in the lab than the assembly version you supplied.
Assembly now is:
.globl read
.type read,@function
read:
subl $8, %esp
movl %ebx, (%esp)
movl 12(%esp), %ebx
movl %esi, 4(%esp)
movl 20(%ebx), %eax
movl 24(%ebx), %edx
movl 12(%ebx), %esi
imull $14406, %eax, %eax
imull $86436, %edx, %edx
imull $343, %esi, %esi
movl 4(%ebx), %ecx
addl %edx, %eax
movl 16(%ebx), %edx
imull $2401, %edx, %edx
addl %edx, %esi
leal 0(,%ecx,8), %edx
subl %ecx, %edx
imull $49, 8(%ebx), %ecx
addl %ecx, %edx
movl (%ebx), %ecx
addl %ecx, %edx
addl %edx, %esi
movl 28(%ebx), %edx
addl %esi, %eax
movl (%esp), %ebx
imull $518616, %edx, %edx
movl 4(%esp), %esi
addl $8, %esp
addl %edx, %eax
ret
Mon Nov 5 15:42:22 CET 2001 Jan Hubicka <jh@suse.cz>
* expmed.c (expand_mult): Force operand to register before computing
cost.
* i386.c (x86_decompose_lea): New global variable.
* i386.h (x86_decompose_lea): Declare.
(TARGET_DECOMPOSE_LEA): New macro.
(RTX_COST): Handle leas properly.
Index: expmed.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/expmed.c,v
retrieving revision 1.93
diff -c -3 -p -r1.93 expmed.c
*** expmed.c 2001/11/02 22:07:29 1.93
--- expmed.c 2001/11/05 14:36:38
*************** expand_mult (mode, op0, op1, target, uns
*** 2406,2411 ****
--- 2406,2415 ----
int mult_cost;
enum {basic_variant, negate_variant, add_variant} variant = basic_variant;
+ /* op0 must be register to make mult_cost match the precomputed shiftadd_cost
+ array. */
+ op0 = force_reg (mode, op0);
+
/* Try to do the computation three ways: multiply by the negative of OP1
and then negate, do the multiplication directly, or do multiplication
by OP1 - 1. */
Index: config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.c,v
retrieving revision 1.330
diff -c -3 -p -r1.330 i386.c
*** i386.c 2001/10/31 14:32:27 1.330
--- i386.c 2001/11/05 14:38:46
*************** const int x86_memory_mismatch_stall = m_
*** 375,380 ****
--- 375,381 ----
const int x86_accumulate_outgoing_args = m_ATHLON | m_PENT4 | m_PPRO;
const int x86_prologue_using_move = m_ATHLON | m_PENT4 | m_PPRO;
const int x86_epilogue_using_move = m_ATHLON | m_PENT4 | m_PPRO;
+ const int x86_decompose_lea = m_PPRO | m_PENT4;
/* In case the avreage insn count for single function invocation is
lower than this constant, emit fast (but longer) prologue and
Index: config/i386/i386.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.h,v
retrieving revision 1.214
diff -c -3 -p -r1.214 i386.h
*** i386.h 2001/10/31 14:32:28 1.214
--- i386.h 2001/11/05 14:39:20
*************** extern const int x86_promote_hi_regs, x8
*** 218,224 ****
extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
extern const int x86_accumulate_outgoing_args, x86_prologue_using_move;
! extern const int x86_epilogue_using_move;
#define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
#define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
--- 218,224 ----
extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
extern const int x86_accumulate_outgoing_args, x86_prologue_using_move;
! extern const int x86_epilogue_using_move, x86_decompose_lea;
#define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
#define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
*************** extern const int x86_epilogue_using_move
*** 256,261 ****
--- 256,262 ----
#define TARGET_MEMORY_MISMATCH_STALL (x86_memory_mismatch_stall & CPUMASK)
#define TARGET_PROLOGUE_USING_MOVE (x86_prologue_using_move & CPUMASK)
#define TARGET_EPILOGUE_USING_MOVE (x86_epilogue_using_move & CPUMASK)
+ #define TARGET_DECOMPOSE_LEA (x86_decompose_lea & CPUMASK)
#define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
*************** while (0)
*** 2485,2491 ****
HOST_WIDE_INT value = INTVAL (XEXP (X, 1)); \
if (value == 1) \
TOPLEVEL_COSTS_N_INSNS (ix86_cost->add); \
! if (value == 2 || value == 3) \
TOPLEVEL_COSTS_N_INSNS (ix86_cost->lea); \
} \
/* fall through */ \
--- 2486,2494 ----
HOST_WIDE_INT value = INTVAL (XEXP (X, 1)); \
if (value == 1) \
TOPLEVEL_COSTS_N_INSNS (ix86_cost->add); \
! if ((value == 2 || value == 3) \
! && !TARGET_DECOMPOSE_LEA \
! && ix86_cost->lea <= ix86_cost->shift_const) \
TOPLEVEL_COSTS_N_INSNS (ix86_cost->lea); \
} \
/* fall through */ \
*************** while (0)
*** 2546,2582 ****
TOPLEVEL_COSTS_N_INSNS (ix86_cost->divide); \
\
case PLUS: \
! if (GET_CODE (XEXP (X, 0)) == PLUS \
! && GET_CODE (XEXP (XEXP (X, 0), 0)) == MULT \
! && GET_CODE (XEXP (XEXP (XEXP (X, 0), 0), 1)) == CONST_INT \
! && GET_CODE (XEXP (X, 1)) == CONST_INT) \
{ \
! HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (X, 0), 0), 1)); \
! if (val == 2 || val == 4 || val == 8) \
{ \
! return (COSTS_N_INSNS (ix86_cost->lea) \
! + rtx_cost (XEXP (XEXP (X, 0), 1), OUTER_CODE) \
! + rtx_cost (XEXP (XEXP (XEXP (X, 0), 0), 0), OUTER_CODE) \
! + rtx_cost (XEXP (X, 1), OUTER_CODE)); \
} \
! } \
! else if (GET_CODE (XEXP (X, 0)) == MULT \
! && GET_CODE (XEXP (XEXP (X, 0), 1)) == CONST_INT) \
! { \
! HOST_WIDE_INT val = INTVAL (XEXP (XEXP (X, 0), 1)); \
! if (val == 2 || val == 4 || val == 8) \
{ \
return (COSTS_N_INSNS (ix86_cost->lea) \
+ rtx_cost (XEXP (XEXP (X, 0), 0), OUTER_CODE) \
+ rtx_cost (XEXP (X, 1), OUTER_CODE)); \
} \
- } \
- else if (GET_CODE (XEXP (X, 0)) == PLUS) \
- { \
- return (COSTS_N_INSNS (ix86_cost->lea) \
- + rtx_cost (XEXP (XEXP (X, 0), 0), OUTER_CODE) \
- + rtx_cost (XEXP (XEXP (X, 0), 1), OUTER_CODE) \
- + rtx_cost (XEXP (X, 1), OUTER_CODE)); \
} \
\
/* fall through */ \
--- 2549,2590 ----
TOPLEVEL_COSTS_N_INSNS (ix86_cost->divide); \
\
case PLUS: \
! if (!TARGET_DECOMPOSE_LEA \
! && INTEGRAL_MODE_P (GET_MODE (X)) \
! && GET_MODE_BITSIZE (GET_MODE (X)) <= GET_MODE_BITSIZE (Pmode)) \
{ \
! if (GET_CODE (XEXP (X, 0)) == PLUS \
! && GET_CODE (XEXP (XEXP (X, 0), 0)) == MULT \
! && GET_CODE (XEXP (XEXP (XEXP (X, 0), 0), 1)) == CONST_INT \
! && CONSTANT_P (XEXP (X, 1))) \
{ \
! HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (X, 0), 0), 1));\
! if (val == 2 || val == 4 || val == 8) \
! { \
! return (COSTS_N_INSNS (ix86_cost->lea) \
! + rtx_cost (XEXP (XEXP (X, 0), 1), OUTER_CODE) \
! + rtx_cost (XEXP (XEXP (XEXP (X, 0), 0), 0), OUTER_CODE) \
! + rtx_cost (XEXP (X, 1), OUTER_CODE)); \
! } \
} \
! else if (GET_CODE (XEXP (X, 0)) == MULT \
! && GET_CODE (XEXP (XEXP (X, 0), 1)) == CONST_INT) \
{ \
+ HOST_WIDE_INT val = INTVAL (XEXP (XEXP (X, 0), 1)); \
+ if (val == 2 || val == 4 || val == 8) \
+ { \
+ return (COSTS_N_INSNS (ix86_cost->lea) \
+ + rtx_cost (XEXP (XEXP (X, 0), 0), OUTER_CODE) \
+ + rtx_cost (XEXP (X, 1), OUTER_CODE)); \
+ } \
+ } \
+ else if (GET_CODE (XEXP (X, 0)) == PLUS) \
+ { \
return (COSTS_N_INSNS (ix86_cost->lea) \
+ rtx_cost (XEXP (XEXP (X, 0), 0), OUTER_CODE) \
+ + rtx_cost (XEXP (XEXP (X, 0), 1), OUTER_CODE) \
+ rtx_cost (XEXP (X, 1), OUTER_CODE)); \
} \
} \
\
/* fall through */ \