This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
2.95, x86: severe performance problems with short arithmetic
- To: gcc@gcc.gnu.org, rth@cygnus.com
- Subject: 2.95, x86: severe performance problems with short arithmetic
- From: Zack Weinberg <zack@bitmover.com>
- Date: Tue, 10 Aug 1999 11:52:32 -0700
This function
/*
 * Copy bytes from buf1 to buf2 and return the sum of the copied bytes.
 *
 * Scanning stops at the first NUL byte, which is neither summed nor
 * copied, or immediately after the first newline, which is both summed
 * and copied.  The sum is accumulated in an unsigned short, so it wraps
 * at that type's width.
 *
 * No length is passed in: buf1 must contain a NUL or '\n', and buf2 must
 * have room for every byte up to that point.  Nothing is written to buf2
 * after the last copied byte, so the output is not NUL-terminated here.
 */
unsigned short
cksum(unsigned char *buf1, unsigned char *buf2)
{
unsigned short sum = 0;
unsigned char *p, *q, c;
p = buf1;
q = buf2;
for (;;) {
c = *p++;
/* NUL ends the scan before the byte is summed or stored. */
if (c == '\0') break;
/* 16-bit accumulate; this is the addhi3 insn starred in the dump below. */
sum += c;
*q++ = c;
/* Newline ends the scan after it has been summed and stored. */
if (c == '\n') break;
}
return (sum);
}
is approximately twice as slow when compiled by 2.95 as when compiled
by 2.7. The entire performance penalty can be blamed on one instruction,
which I have starred below. This dump was generated by the current
mainline, but 2.95 generates identical code.
.file "fpd2.c"
.version "01.01"
gcc2_compiled.:
.text
.align 4
.globl cksum
.type cksum,@function
cksum:
pushl %esi # 80 movsi-2
pushl %ebx # 81 movsi-2
movl 12(%esp),%ebx # 15 movsi+2/2
movl 16(%esp),%ecx # 18 movsi+2/2
xorl %esi,%esi # 12 movhi+1/1
.p2align 4,,7
.L3:
movb (%ebx),%dl # 26 movqi+1/1
incl %ebx # 27 addsi3+1/1
testb %dl,%dl # 29 tstqi_1
je .L4 # 30 bleu+1
movzbw %dl,%ax # 35 zero_extendqihi2+1
*** addl %eax,%esi # 37 addhi3+1/1
movb %dl,(%ecx) # 41 movqi+1/3
incl %ecx # 42 addsi3+1/1
cmpb $10,%dl # 44 cmpqi_1/2
jne .L3 # 45 bleu+1
.L4:
movzwl %si,%eax # 61 zero_extendhisi2+1
popl %ebx # 84 pop
popl %esi # 85 pop
ret # 86 return_internal
.Lfe1:
.size cksum,.Lfe1-cksum
.ident "GCC: (GNU) 2.96 19990808 (experimental)"
If I change that instruction to 'addw %ax,%si', the code is as fast as
that produced by 2.7 (and indeed 2.7 uses an addw here). The
new_ia32_branch uses an addw for this case, so the problem is fixed there,
but It Would Be Really Nice if it could get fixed in 2.95.1 as well.
(I am attempting to convince my boss of the wisdom of dropping 2.7,
and a 30% performance penalty for real code isn't helping me any...)
The corresponding insn is in HImode all the way to the .stack dump:
(insn 37 35 41 (set (reg/v:HI 4 %si)
(plus:HI (reg/v:HI 4 %si)
(reg:HI 0 %ax))) 208 {addhi3+1}
(insn_list/j/c 35 (insn_list/j/c 35 (nil)))
(expr_list:REG_DEAD (reg:HI 0 %ax)
(nil)))
so I am guessing that the problem is with this piece of i386.md:
(define_insn ""
[(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r,?r")
(plus:HI (match_operand:HI 1 "nonimmediate_operand" "%0,0,r")
(match_operand:HI 2 "general_operand" "ri,rm,ri")))]
"ix86_binary_operator_ok (PLUS, HImode, operands)"
"*
{
/* ... */
/* Use a 32-bit operation when possible, to avoid the prefix penalty. */
if (REG_P (operands[0])
&& i386_aligned_p (operands[2])
&& i386_cc_probably_useless_p (insn))
{
CC_STATUS_INIT;
if (GET_CODE (operands[2]) == CONST_INT)
{
HOST_WIDE_INT intval = 0xffff & INTVAL (operands[2]);
if (intval == 1)
return AS1 (inc%L0,%k0);
if (intval == 0xffff)
return AS1 (dec%L0,%k0);
operands[2] = i386_sext16_if_const (operands[2]);
}
return AS2 (add%L0,%k2,%k0);
}
if (operands[2] == const1_rtx)
return AS1 (inc%W0,%0);
if (operands[2] == constm1_rtx
|| (GET_CODE (operands[2]) == CONST_INT
&& INTVAL (operands[2]) == 65535))
return AS1 (dec%W0,%0);
return AS2 (add%W0,%2,%0);
}"
[(set_attr "type" "binary")])
Looks like it would suffice to rip out the entire "Use a 32-bit
operation when possible" if block - but there was a reason it was put
there in the first place, right?
zw