This is the mail archive of the egcs@egcs.cygnus.com mailing list for the EGCS project. See the EGCS home page for more information.
I've been looking at a strength reduction bug for the last week or
so. Until very recently we generated incorrect code when Joern's giv
(general induction variable) combiner was active for this case. Now
the code is correct, but decidedly suboptimal due to increased
register pressure.
Here's the source:
typedef unsigned long size_t;
/*
 * Reduced test case for the loop optimizer.  Copies bytes from s2 into
 * the locations following s1, four per loop iteration, and stops after
 * the first NUL byte has been copied or when the group counter reaches
 * zero.
 *
 * s1: destination pointer.  Every store pre-increments s1, so the first
 *     byte is written to s1[1], not s1[0].
 * s2: source pointer, read one byte at a time with post-increment.
 * n:  byte count.  Only n >> 2 (the number of four-byte groups) is used.
 *
 * Returns the value s1 had on entry, minus one.
 */
char *
copy(s1, s2, n)
char *s1;
const char *s2;
size_t n;
{
char c;
char *s = s1;
/* Number of four-byte groups; the low two bits of n are discarded. */
size_t n4 = n >> 2;
/* NOTE(review): this decrements the returned pointer s, not the store
   pointer s1.  Possibly `--s1' was intended -- confirm before reusing
   this code outside of the test case. */
--s;
/* The body runs at least once, even when n4 is 0. */
do
{
/* Four identical copy steps: load a byte, store it, stop on NUL. */
c = *s2++;
*++s1 = c;
if(c == '\0') break;
c = *s2++;
*++s1 = c;
if(c == '\0') break;
c = *s2++;
*++s1 = c;
if(c == '\0') break;
c = *s2++;
*++s1 = c;
if(c == '\0') break;
}
/* n4 is unsigned, so if it started at 0 this decrement wraps to a large
   value and the loop keeps going until a NUL byte is copied. */
while(--n4);
return s;
}
Without Joern's stuff, we get this assembly for the loop:
.L5:
decl %ebx
jz .L4
.L3:
movb (%eax),%dl
incl %eax
incl %ecx
movb %dl,(%ecx)
testb %dl,%dl
je .L4
movb (%eax),%dl
incl %eax
incl %ecx
movb %dl,(%ecx)
testb %dl,%dl
je .L4
movb (%eax),%dl
incl %eax
incl %ecx
movb %dl,(%ecx)
testb %dl,%dl
je .L4
movb (%eax),%dl
incl %eax
incl %ecx
movb %dl,(%ecx)
testb %dl,%dl
jne .L5
.L4:
This code is fine. It may in fact be optimal for this case, due to
pipelining; I'm not enough of an x86 expert to say. If I were to bang
on it by hand, I'd produce something like this:
.L5:
decl %ebx
jz .L4
addl $4,%eax
addl $4,%ecx
.L3:
movb (%eax),%dl
movb %dl,1(%ecx)
testb %dl,%dl
je .L4
movb 1(%eax),%dl
movb %dl,2(%ecx)
testb %dl,%dl
je .L4
movb 2(%eax),%dl
movb %dl,3(%ecx)
testb %dl,%dl
je .L4
movb 3(%eax),%dl
movb %dl,4(%ecx)
testb %dl,%dl
jne .L5
.L4:
which might be worse due to pipeline stalls, but has the same register
demands and roughly the same code size.
With Joern's code, we get instead:
.L5:
decl -8(%ebp)
jz .L4
.L3:
movb -3(%edi),%dl
movb %dl,(%eax)
testb %dl,%dl
je .L4
movb (%ebx),%dl
leal 1(%eax),%esi
movb %dl,1(%eax)
testb %dl,%dl
je .L4
movl -12(%ebp),%ebx
movb (%ebx),%dl
movb %dl,2(%eax)
testb %dl,%dl
je .L4
movb (%edi),%dl
addl $4,%ebx
movl %ebx,-12(%ebp)
addl $4,%ecx
leal 3(%ecx),%edi
leal 1(%ecx),%ebx
addl $4,%eax
movb %dl,-2(%esi)
testb %dl,%dl
jne .L5
.L4:
Register pressure has forced things onto the stack. We have many more
address generations. We have pointer recalculations inside the loop.
We have significantly increased code size.
Looking at the loop dump, the difference seems to be that Joern's code
converts the incremented pointers to their own givs (general induction
variables), and then preserves those givs at the expense of the
original bivs (basic induction variables). We should prefer to hang on
to the original bivs and express things in terms of them, at least in
this context.
zw