[Bug target/68233] New: Performance : GCC not uses possible LDP-Instruction on ARM64

gunnar.von.boehn at huawei dot com gcc-bugzilla@gcc.gnu.org
Fri Nov 6 09:32:00 GMT 2015


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68233

            Bug ID: 68233
           Summary: Performance : GCC not uses possible LDP-Instruction on
                    ARM64
           Product: gcc
           Version: 4.9.2
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: gunnar.von.boehn at huawei dot com
  Target Milestone: ---

Dear List,

It seems to me that GCC does not fully utilize the available LDP instruction
on ARM64. On Cortex-A57, one LDP instruction can load two 64-bit registers in
one cycle, whereas an LDR instruction can load only one register per cycle.


****************************************************************

I have an ARM64 / Cortex-A57 system:
acc@linaro-nano:~/minibench9$ cat /proc/cpuinfo
processor       : 0
Features        : fp asimd evtstrm aes pmull sha1 sha2 crc32
CPU implementer : 0x41
CPU architecture: 8
CPU variant     : 0x1
CPU part        : 0xd07
CPU revision    : 1

****************************************************************

gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/aarch64-linux-gnu/4.9/lto-wrapper
Target: aarch64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro
4.9.2-10ubuntu13' --with-bugurl=file:///usr/share/doc/gcc-4.9/README.Bugs
--enable-languages=c,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr
--program-suffix=-4.9 --enable-shared --enable-linker-build-id
--libexecdir=/usr/lib --without-included-gettext --enable-threads=posix
--with-gxx-include-dir=/usr/include/c++/4.9 --libdir=/usr/lib --enable-nls
--with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug
--enable-libstdcxx-time=yes --enable-gnu-unique-object --disable-libsanitizer
--disable-libquadmath --enable-plugin --with-system-zlib
--disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo
--with-java-home=/usr/lib/jvm/java-1.5.0-gcj-4.9-arm64/jre --enable-java-home
--with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-4.9-arm64
--with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-4.9-arm64
--with-arch-directory=arm64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar
--enable-multiarch --disable-werror --enable-checking=release
--build=aarch64-linux-gnu --host=aarch64-linux-gnu --target=aarch64-linux-gnu
Thread model: posix
gcc version 4.9.2 (Ubuntu/Linaro 4.9.2-10ubuntu13)

****************************************************************
compile options:

gcc -O2 -S -mcpu=cortex-A57 -mtune=cortex-A57 ldp.c

****************************************************************

C Source:

acc@linaro-nano:~/minibench9$ cat ldp.c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <limits.h>
#include <arm_neon.h>

 /*
  * Reproducer kernel: for each of bk iterations, read four doubles from
  * ptrba and two doubles from ptrbb, accumulate the eight pairwise products
  * into res0..res7, then advance ptrba by 4 and ptrbb by 2 elements.
  * Returns the sum of the eight accumulators.
  *
  * NOTE(review): res0..res7 are declared without initializers and are read
  * in the loop before ever being assigned, so the result is indeterminate.
  * The code is kept exactly as submitted because the assembly listing that
  * follows was generated from it.
  */
 double zgemm(double * ptrba, double * ptrbb, size_t bk){
        /* load0/2/4/5 hold the ptrba values, load1/3 the ptrbb values. */
        double  load0,load1,load2,load3,load4,load5;
        /* Eight running sums; never initialized (see NOTE above). */
        double  res0,res1,res2,res3,res4,res5,res6,res7;

             /* Runs bk times; does nothing when bk is 0. */
             for( ; bk; bk--)
               {

                  /* Four consecutive doubles from ptrba. */
                  load0 = ptrba[4*0+0];
                  load2 = ptrba[4*0+1];
                  load4 = ptrba[4*0+2];
                  load5 = ptrba[4*0+3];

                  /* Two consecutive doubles from ptrbb. */
                  load1 = ptrbb[4*0+0];
                  load3 = ptrbb[4*0+1];

                  /* res0..res3: each ptrba value times ptrbb[0]. */
                  res0 = res0+load0*load1;
                  res1 = res1+load2*load1;
                  res2 = res2+load4*load1;
                  res3 = res3+load5*load1;
                  /* res4..res7: each ptrba value times ptrbb[1]. */
                  res4 = res4+load0*load3;
                  res5 = res5+load2*load3;
                  res6 = res6+load4*load3;
                  res7 = res7+load5*load3;

                  /* Step both input cursors to the next group of elements. */
                  ptrba += 4;
                  ptrbb += 2;

                }
                /* Fold the eight partial sums into res0, in order. */
                res0 += res1;
                res0 += res2;
                res0 += res3;
                res0 += res4;
                res0 += res5;
                res0 += res6;
                res0 += res7;
                return res0;
        }

***********************************************************
Created ASM code:


acc@linaro-nano:~/minibench9$ cat ldp.s
        // GCC 4.9.2 -O2 output for zgemm() from ldp.c.
        // Register use as visible in the code below:
        //   x0 = ptrba cursor (advanced by 32 bytes = 4 doubles per iteration)
        //   x1 = ptrbb cursor (advanced by 16 bytes = 2 doubles per iteration)
        //   x2 = bk, the remaining iteration count
        //   d3, d21, d20, d19, d18, d0, d17, d16 = the eight accumulators;
        //   none of them is written before the loop, matching the
        //   uninitialized res0..res7 in the C source.
        .cpu cortex-a57+fp+simd+crc
        .file   "ldp.c"
        .text
        .align  2
        .global zgemm
        .type   zgemm, %function
zgemm:
        // Skip the loop entirely when bk == 0.
        cbz     x2, .L2
.L3:
        // d2 = ptrbb[0]
        ldr     d2, [x1]
        // bk--; the flags set here are consumed by the bne at the loop end.
        subs    x2, x2, #1
        // ptrba += 4 doubles; the ptrba loads below use negative offsets.
        add     x0, x0, 32
        // d1 = ptrbb[1] (adjacent to the d2 load: first LDP candidate)
        ldr     d1, [x1, 8]
        // ptrbb += 2 doubles
        add     x1, x1, 16
        // d7, d6 = ptrba[0], ptrba[1] (adjacent: second LDP candidate)
        ldr     d7, [x0, -32]
        ldr     d6, [x0, -24]
        // d5, d4 = ptrba[2], ptrba[3] (adjacent: third LDP candidate)
        ldr     d5, [x0, -16]
        ldr     d4, [x0, -8]
        // Eight multiply-accumulates, acc += a * b:
        //   d3  (res0) += ptrba[0]*ptrbb[0]    d18 (res4) += ptrba[0]*ptrbb[1]
        //   d21 (res1) += ptrba[1]*ptrbb[0]    d0  (res5) += ptrba[1]*ptrbb[1]
        //   d20 (res2) += ptrba[2]*ptrbb[0]    d17 (res6) += ptrba[2]*ptrbb[1]
        //   d19 (res3) += ptrba[3]*ptrbb[0]    d16 (res7) += ptrba[3]*ptrbb[1]
        fmadd   d3, d7, d2, d3
        fmadd   d18, d7, d1, d18
        fmadd   d21, d6, d2, d21
        fmadd   d20, d5, d2, d20
        fmadd   d19, d4, d2, d19
        fmadd   d0, d6, d1, d0
        fmadd   d17, d5, d1, d17
        fmadd   d16, d4, d1, d16
        // Loop while the decremented bk is non-zero.
        bne     .L3
.L2:
        // Sum the accumulators in source order (res0 + res1 + ... + res7);
        // the final result is left in d0 as the return value.
        fadd    d2, d3, d21
        fadd    d3, d2, d20
        fadd    d3, d3, d19
        fadd    d1, d3, d18
        fadd    d1, d1, d0
        fadd    d0, d1, d17
        fadd    d0, d0, d16
        ret
        .size   zgemm, .-zgemm
        .ident  "GCC: (Ubuntu/Linaro 4.9.2-10ubuntu13) 4.9.2"
        .section        .note.GNU-stack,"",%progbits

***********************************************************

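As you can see, 6 doubles are loaded in the loop body.
GCC uses 6 LDR instructions for this.
Alternatively, 3 LDP instructions could be used, because the loads come in
three pairs of adjacent addresses: [x1] and [x1, 8], [x0, -32] and [x0, -24],
and [x0, -16] and [x0, -8].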


***********************************************************
I hope this report is helpful to you.
Please tell me if you need anything else.

***********************************************************


More information about the Gcc-bugs mailing list