This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug rtl-optimization/59857] New: 4.8.2 loop optimization is worse than 4.5.1 under ARM
- From: "xuelingko at yahoo dot com.tw" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Fri, 17 Jan 2014 11:26:44 +0000
- Subject: [Bug rtl-optimization/59857] New: 4.8.2 loop optimization is worse than 4.5.1 under ARM
- Auto-submitted: auto-generated
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59857
Bug ID: 59857
Summary: 4.8.2 loop optimization is worse than 4.5.1 under ARM
Product: gcc
Version: 4.8.2
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: xuelingko at yahoo dot com.tw
I compiled a simple source file, memread.c, with gcc 4.8.2 and gcc 4.5.1.
The C code is:
/*
 * Reproducer for the loop code-generation difference reported here.
 *
 * Walks nCount consecutive elements starting at pSrc, loading each one
 * into a local that is never read afterwards, then returns the constant 10.
 *
 * NOTE(review): `ulv` is not defined in this snippet; presumably it is a
 * typedef for a volatile unsigned type (otherwise the unused loads could
 * be optimized away at -O2) -- confirm against the full memread.c.
 */
int TEST_Memread(ulv * pSrc, unsigned int nCount)
{
unsigned int val;
ulv *p1 = NULL;
unsigned int i;
p1 = (ulv *) pSrc;
/* Load and post-increment: this is the statement whose code generation differs. */
for (i = 0; i < nCount; i++)
val = *p1++;
/* The return value is a fixed constant, independent of the data read. */
return 10;
}
# gcc -Wall -O2 -static -g -gstabs+ -c memread.c
4.8.2:
Target: armv7a
Configured with: ../gcc-4.8.2/configure
--prefix=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/gcc
--host=x86_64-pc-linux-gnu --build=x86_64-pc-linux-gnu
--target=armv7a-mediatek-linux-gnueabi
--with-sysroot=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/gcc/sysroot
--with-arch=armv7-a --with-tune=cortex-a7 --with-cpu=cortex-a7 --with-interwork
--with-fpu=vfpv4-d16 --with-float=softfp --with-gnu-as --with-gnu-ld
--disable-nls --enable-shared --enable-__cxa_atexit --disable-multilib
--enable-c99 --enable-long-long --enable-threads=posix --enable-languages=c,c++
--with-gmp=/tmp/root/build/x86_64 --with-mpfr=/tmp/root/build/x86_64
--with-cloog=/tmp/root/build/x86_64 --with-isl=/tmp/root/build/x86_64
--with-libelf=/tmp/root/build/x86_64
--program-transform-name='s,^,armv7a_001_vfp-linux-gnueabi-,'
--with-mpc=/tmp/root/build/x86_64 --enable-lto --without-system-libunwind
--disable-rpath --with-host-libstdcxx='-static-libgcc
-Wl,-Bstatic,-lstdc++,-Bdynamic,-lm'
--with-specs='%{!fno-unwind-tables:-funwind-tables}'
--with-build-time-tools=/tmp/root/usr/toolchain-4.8.2-vfp/cortex-a7/binutils/armv7a/bin
--enable-cxx-flags='-g -O2'
Thread model: posix
gcc version 4.8.2 20131014 (prerelease) (Linaro GCC 4.8-2013.10)
4.5.1:
Target: armv7a
Configured with: ../gcc-4.5.1/configure
--prefix=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/gcc
--host=i686-pc-linux-gnu --target=armv7a
--with-sysroot=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/gcc/sysroot
--with-arch=armv7-a --with-tune=cortex-a9 --with-cpu=cortex-a9 --with-interwork
--with-fpu=vfp --with-float=softfp --with-gnu-as --with-gnu-ld --disable-nls
--enable-shared --enable-__cxa_atexit --disable-multilib --enable-c99
--enable-long-long --enable-threads=posix --enable-languages=c,c++
--with-gmp=/tmp/root/build/i686 --with-mpfr=/tmp/root/build/i686
--with-ppl=/tmp/root/build/i686 --with-cloog=/tmp/root/build/i686
--with-libelf=/tmp/root/build/i686 --program-transform-name='s,^,armv7a-,'
--with-mpc=/tmp/root/build/i686 --enable-lto --without-system-libunwind
--disable-rpath --with-host-libstdcxx='-static-libgcc
-Wl,-Bstatic,-lstdc++,-Bdynamic,-lm'
--with-specs='%{!fno-unwind-tables:-funwind-tables}'
--with-build-time-tools=/tmp/root/usr/toolchain-4.5.1-vfp/cortex-a9/binutils/armv7a/bin/
--enable-cxx-flags='-g -O2'
Thread model: posix
gcc version 4.5.1 (GCC)
The objdump output for 4.8.2 is:
Disassembly of section .text:
00000000 <TEST_Memread>:
unsigned int val;
ulv *p1 = NULL;
unsigned int i;
p1 = (ulv *) pSrc;
for (i = 0; i < nCount; i++)
0: e3510000 cmp r1, #0
4: 0a000005 beq 20 <TEST_Memread+0x20>
8: e3a03000 mov r3, #0
val = *p1++;
c: e5902000 ldr r2, [r0]
unsigned int val;
ulv *p1 = NULL;
unsigned int i;
p1 = (ulv *) pSrc;
for (i = 0; i < nCount; i++)
10: e2833001 add r3, r3, #1
14: e1530001 cmp r3, r1
val = *p1++;
18: e2800004 add r0, r0, #4
unsigned int val;
ulv *p1 = NULL;
unsigned int i;
p1 = (ulv *) pSrc;
for (i = 0; i < nCount; i++)
1c: 1afffffa bne c <TEST_Memread+0xc>
val = *p1++;
return 10;
}
20: e3a0000a mov r0, #10
24: e12fff1e bx lr
The objdump output for 4.5.1 is:
Disassembly of section .text:
00000000 <TEST_Memread>:
unsigned int val;
ulv *p1 = NULL;
unsigned int i;
p1 = (ulv *) pSrc;
for (i = 0; i < nCount; i++)
0: e3510000 cmp r1, #0
4: 0a000004 beq 1c <TEST_Memread+0x1c>
8: e3a03000 mov r3, #0
c: e2833001 add r3, r3, #1
val = *p1++;
10: e4902004 ldr r2, [r0], #4
unsigned int val;
ulv *p1 = NULL;
unsigned int i;
p1 = (ulv *) pSrc;
for (i = 0; i < nCount; i++)
14: e1510003 cmp r1, r3
18: 8afffffb bhi c <TEST_Memread+0xc>
val = *p1++;
return 10;
}
1c: e3a0000a mov r0, #10
20: e12fff1e bx lr
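The main difference between them is that 4.8.2 emits a separate load and pointer increment, whereas 4.5.1 uses a single post-indexed load: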
4.8.2:
c: e5902000 ldr r2, [r0]
18: e2800004 add r0, r0, #4
4.5.1:
10: e4902004 ldr r2, [r0], #4
For this loop, the code generated by 4.8.2 achieves only 80% of the performance of
the code generated by 4.5.1, which makes memory-read results worse when using 4.8.2.