This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug target/32414] New: [4.1/4.2 Regression] Poor code for inner loop on i386
- From: "jakub at gcc dot gnu dot org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 20 Jun 2007 09:20:48 -0000
- Subject: [Bug target/32414] New: [4.1/4.2 Regression] Poor code for inner loop on i386
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
/* { dg-do compile } */
/* { dg-options "-O2 -m32 -mtune=generic" } */
/* The testcase defines the fixed-width type names itself instead of
   including a header.  */
typedef unsigned short int uint16_t;
typedef unsigned int uint32_t;
/* The row strides are obtained from external functions that are only
   declared here, not defined.  */
extern int get_src_stride(void);
extern int get_dst_stride(void);
/* For each of `height' rows, copy `width' 32-bit words from the source row
   to the destination row, OR-ing 0xFF000000 into every word copied.

   pSrc, pDst: start of the first source / destination row.
   width:      number of 32-bit words processed per row.
   height:     number of rows.

   The start of each following row is found by adding the values returned
   by get_src_stride () and get_dst_stride () to the row pointers, so the
   strides are counted in uint32_t elements.  */
void
foo (uint32_t *pSrc, uint32_t *pDst, uint16_t width, uint16_t height)
{
uint32_t *dstLine;
register uint32_t *dst;
uint32_t *srcLine;
register uint32_t *src;
int dstStride, srcStride;
/* The inner-loop counter is deliberately a 16-bit type in this variant.  */
uint16_t w;
srcStride = get_src_stride ();
dstStride = get_dst_stride ();
dstLine = pDst;
srcLine = pSrc;
while (height--)
{
/* Take the current row pointers, then advance the row starts to the
   next row.  */
dst = dstLine;
dstLine += dstStride;
src = srcLine;
srcLine += srcStride;
w = width;
/* Inner loop this report is about: w counts down to zero while both
   pointers are post-incremented.  */
while (w--)
*dst++ = *src++ | 0xFF000000;
}
}
This testcase generates extremely poor code for the inner loop in 4.1 and 4.2:
.L6:
movl -16(%ebp), %eax # src,
subw $1, -34(%ebp) #, w
addl $4, -16(%ebp) #, src
movl (%eax), %ecx #,
movl -20(%ebp), %eax # dst,
orl $-16777216, %ecx #,
movl %ecx, (%eax) #,
addl $4, %eax #,
cmpw $-1, -34(%ebp) #, w
movl %eax, -20(%ebp) #, dst
je .L4 #,
jmp .L6 #
I believe this was introduced by the
http://gcc.gnu.org/ml/gcc-patches/2005-07/msg02021.html
patch and fixed on the trunk by the
http://gcc.gnu.org/ml/gcc-patches/2007-01/msg02095.html
patch. The loop generated on the trunk still isn't perfect:
.L4:
movl (%ebx), %eax #* src, tmp82
addl $4, %ebx #, src
subw $1, -14(%ebp) #, w
orl $-16777216, %eax #, tmp82
movl %eax, (%edi) # tmp82,* dst
addl $4, %edi #, dst
cmpw $-1, -14(%ebp) #, w
je .L3 #,
jmp .L4 #
but still far better than what 4.1 and 4.2 generate.
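A slightly modified testcase (32-bit loop counter and indexed accesses instead of a 16-bit counter and pointer increments):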
/* The testcase defines the fixed-width type names itself instead of
   including a header.  */
typedef unsigned short int uint16_t;
typedef unsigned int uint32_t;
/* The row strides are obtained from external functions that are only
   declared here, not defined.  */
extern int get_src_stride(void);
extern int get_dst_stride(void);
/* For each of `height' rows, copy `width' 32-bit words from the source row
   to the destination row, OR-ing 0xFF000000 into every word copied.

   pSrc, pDst: start of the first source / destination row.
   width:      number of 32-bit words processed per row.
   height:     number of rows.

   The start of each following row is found by adding the values returned
   by get_src_stride () and get_dst_stride () to the row pointers, so the
   strides are counted in uint32_t elements.

   This variant uses a 32-bit counter that counts up and indexes both rows
   with it.  */
void
foo (uint32_t *pSrc, uint32_t *pDst, uint16_t width, uint16_t height)
{
uint32_t *dstLine;
register uint32_t *dst;
uint32_t *srcLine;
register uint32_t *src;
int dstStride, srcStride;
/* The inner-loop counter is a 32-bit type here, although `width' is
   still 16 bits wide.  */
uint32_t w;
srcStride = get_src_stride ();
dstStride = get_dst_stride ();
dstLine = pDst;
srcLine = pSrc;
while (height--)
{
/* Take the current row pointers, then advance the row starts to the
   next row.  */
dst = dstLine;
dstLine += dstStride;
src = srcLine;
srcLine += srcStride;
/* Inner loop: dst and src stay fixed for the row; only the index w
   changes.  */
for (w = 0; w < width; w++)
dst[w] = src[w] | 0xFF000000;
}
}
generates more compact code:
.L4:
movl (%edx,%ecx,4), %eax #* srcLine, tmp79
orl $-16777216, %eax #, tmp79
movl %eax, (%ebx,%ecx,4) # tmp79,* dstLine
addl $1, %ecx #, w
cmpl %esi, %ecx # width, w
jae .L3 #,
jmp .L4 #
--
Summary: [4.1/4.2 Regression] Poor code for inner loop on i386
Product: gcc
Version: 4.1.2
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: jakub at gcc dot gnu dot org
GCC target triplet: i386-linux
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=32414