[Bug target/68233] New: Performance : GCC not uses possible LDP-Instruction on ARM64
gunnar.von.boehn at huawei dot com
gcc-bugzilla@gcc.gnu.org
Fri Nov 6 09:32:00 GMT 2015
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68233
Bug ID: 68233
Summary: Performance : GCC not uses possible LDP-Instruction on
ARM64
Product: gcc
Version: 4.9.2
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: gunnar.von.boehn at huawei dot com
Target Milestone: ---
Dear List,
it seems to me that GCC does not fully utilize the available LDP instruction on ARM64.
On Cortex-A57 the LDP instruction can load two 64-bit registers in one cycle,
whereas an LDR instruction can load only one register per cycle.
****************************************************************
I have an ARM64 / Cortex-A57 system:
acc@linaro-nano:~/minibench9$ cat /proc/cpuinfo
processor : 0
Features : fp asimd evtstrm aes pmull sha1 sha2 crc32
CPU implementer : 0x41
CPU architecture: 8
CPU variant : 0x1
CPU part : 0xd07
CPU revision : 1
****************************************************************
gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/aarch64-linux-gnu/4.9/lto-wrapper
Target: aarch64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro
4.9.2-10ubuntu13' --with-bugurl=file:///usr/share/doc/gcc-4.9/README.Bugs
--enable-languages=c,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr
--program-suffix=-4.9 --enable-shared --enable-linker-build-id
--libexecdir=/usr/lib --without-included-gettext --enable-threads=posix
--with-gxx-include-dir=/usr/include/c++/4.9 --libdir=/usr/lib --enable-nls
--with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug
--enable-libstdcxx-time=yes --enable-gnu-unique-object --disable-libsanitizer
--disable-libquadmath --enable-plugin --with-system-zlib
--disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo
--with-java-home=/usr/lib/jvm/java-1.5.0-gcj-4.9-arm64/jre --enable-java-home
--with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-4.9-arm64
--with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-4.9-arm64
--with-arch-directory=arm64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar
--enable-multiarch --disable-werror --enable-checking=release
--build=aarch64-linux-gnu --host=aarch64-linux-gnu --target=aarch64-linux-gnu
Thread model: posix
gcc version 4.9.2 (Ubuntu/Linaro 4.9.2-10ubuntu13)
****************************************************************
compile options:
gcc -O2 -S -mcpu=cortex-A57 -mtune=cortex-A57 ldp.c
****************************************************************
C Source:
acc@linaro-nano:~/minibench9$ cat ldp.c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <limits.h>
#include <arm_neon.h>
/*
 * Reproduction kernel for the missed-LDP report.
 *
 * For each of the bk iterations, reads four consecutive doubles from ptrba
 * and two consecutive doubles from ptrbb, accumulates the eight pairwise
 * products into res0..res7, then advances ptrba by 4 and ptrbb by 2 elements.
 * Returns the sum of the eight accumulators.
 *
 * NOTE(review): res0..res7 are never assigned before they are first read, so
 * the returned value is indeterminate. Left unchanged because this is the
 * reporter's test case and the quoted compiler output corresponds to exactly
 * this source.
 */
double zgemm(double * ptrba, double * ptrbb, size_t bk){
/* Loaded operands: load0/2/4/5 come from ptrba, load1/3 from ptrbb. */
double load0,load1,load2,load3,load4,load5;
/* Product accumulators (not initialised, see note above). */
double res0,res1,res2,res3,res4,res5,res6,res7;
for( ; bk; bk--)
{
/* Four adjacent doubles from ptrba: candidates for two paired loads. */
load0 = ptrba[4*0+0];
load2 = ptrba[4*0+1];
load4 = ptrba[4*0+2];
load5 = ptrba[4*0+3];
/* Two adjacent doubles from ptrbb: candidate for one paired load. */
load1 = ptrbb[4*0+0];
load3 = ptrbb[4*0+1];
/* res0..res3 accumulate the ptrba values times ptrbb[0]. */
res0 = res0+load0*load1;
res1 = res1+load2*load1;
res2 = res2+load4*load1;
res3 = res3+load5*load1;
/* res4..res7 accumulate the ptrba values times ptrbb[1]. */
res4 = res4+load0*load3;
res5 = res5+load2*load3;
res6 = res6+load4*load3;
res7 = res7+load5*load3;
/* Step to the next group of 4 (ptrba) and 2 (ptrbb) elements. */
ptrba += 4;
ptrbb += 2;
}
/* Fold the eight accumulators into a single return value. */
res0 += res1;
res0 += res2;
res0 += res3;
res0 += res4;
res0 += res5;
res0 += res6;
res0 += res7;
return res0;
}
***********************************************************
Created ASM code:
acc@linaro-nano:~/minibench9$ cat ldp.s
// GCC 4.9.2 output for zgemm() from ldp.c (AArch64, GNU as syntax).
// Arguments per AAPCS64: x0 = ptrba, x1 = ptrbb, x2 = bk; result in d0.
// Accumulators: d3=res0, d21=res1, d20=res2, d19=res3,
//               d18=res4, d0=res5, d17=res6, d16=res7.
// None of the accumulator registers is written before the loop, matching
// the uninitialised res0..res7 in the C source.
.cpu cortex-a57+fp+simd+crc
.file "ldp.c"
.text
.align 2
.global zgemm
.type zgemm, %function
zgemm:
cbz x2, .L2                     // bk == 0: skip the loop entirely
.L3:
ldr d2, [x1]                    // d2 = ptrbb[0] (load1)
subs x2, x2, #1                 // bk--, sets flags for the bne below
add x0, x0, 32                  // ptrba += 4 doubles (done before the ptrba loads)
ldr d1, [x1, 8]                 // d1 = ptrbb[1] (load3); adjacent to the d2 load
add x1, x1, 16                  // ptrbb += 2 doubles
// Four single loads of adjacent doubles; offsets are negative because x0
// has already been advanced by 32 bytes.
ldr d7, [x0, -32]               // d7 = ptrba[0] (load0)
ldr d6, [x0, -24]               // d6 = ptrba[1] (load2)
ldr d5, [x0, -16]               // d5 = ptrba[2] (load4)
ldr d4, [x0, -8]                // d4 = ptrba[3] (load5)
fmadd d3, d7, d2, d3            // res0 += load0*load1
fmadd d18, d7, d1, d18          // res4 += load0*load3
fmadd d21, d6, d2, d21          // res1 += load2*load1
fmadd d20, d5, d2, d20          // res2 += load4*load1
fmadd d19, d4, d2, d19          // res3 += load5*load1
fmadd d0, d6, d1, d0            // res5 += load2*load3
fmadd d17, d5, d1, d17          // res6 += load4*load3
fmadd d16, d4, d1, d16          // res7 += load5*load3
bne .L3                         // loop while bk != 0
.L2:
// Sum the eight accumulators in res0..res7 order; final result lands in d0.
fadd d2, d3, d21                // res0 + res1
fadd d3, d2, d20                // + res2
fadd d3, d3, d19                // + res3
fadd d1, d3, d18                // + res4
fadd d1, d1, d0                 // + res5
fadd d0, d1, d17                // + res6
fadd d0, d0, d16                // + res7
ret
.size zgemm, .-zgemm
.ident "GCC: (Ubuntu/Linaro 4.9.2-10ubuntu13) 4.9.2"
.section .note.GNU-stack,"",%progbits
***********************************************************
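As you can see, 6 doubles are loaded in the loop body.
GCC uses 6 LDR instructions for this.
However, 3 LDP instructions could be used instead: two for the four adjacent doubles at ptrba[0..3] and one for the two adjacent doubles at ptrbb[0..1].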
***********************************************************
I hope this report is helpful to you.
Please tell me if you need anything else.
***********************************************************
More information about the Gcc-bugs
mailing list