This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug c/79491] New: Possibly inefficient code for the inner product of two vectors
- From: "drraph at gmail dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Mon, 13 Feb 2017 16:41:37 +0000
- Subject: [Bug c/79491] New: Possibly inefficient code for the inner product of two vectors
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79491
Bug ID: 79491
Summary: Possibly inefficient code for the inner product of two
vectors
Product: gcc
Version: 7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: drraph at gmail dot com
Target Milestone: ---
Consider:
float f(float x[], float y[]) {
float p = 0;
for (int i = 0; i <64; i++)
p += x[i] * y[i];
return p;
}
Using gcc 7 (snapshot) and -Ofast -march=core-avx2 you get:
f:
mov rax, rdi
shr rax, 2
neg rax
and eax, 7
je .L6
vmovss xmm0, DWORD PTR [rdi]
vmulss xmm1, xmm0, DWORD PTR [rsi]
cmp eax, 1
je .L7
vmovss xmm4, DWORD PTR [rdi+4]
vfmadd231ss xmm1, xmm4, DWORD PTR [rsi+4]
cmp eax, 2
je .L8
vmovss xmm3, DWORD PTR [rdi+8]
vfmadd231ss xmm1, xmm3, DWORD PTR [rsi+8]
cmp eax, 3
je .L9
vmovss xmm2, DWORD PTR [rdi+12]
vfmadd231ss xmm1, xmm2, DWORD PTR [rsi+12]
cmp eax, 4
je .L10
vmovss xmm3, DWORD PTR [rdi+16]
vfmadd231ss xmm1, xmm3, DWORD PTR [rsi+16]
cmp eax, 5
je .L11
vmovss xmm7, DWORD PTR [rdi+20]
vfmadd231ss xmm1, xmm7, DWORD PTR [rsi+20]
cmp eax, 7
jne .L12
vmovss xmm4, DWORD PTR [rsi+24]
vfmadd231ss xmm1, xmm4, DWORD PTR [rdi+24]
mov r9d, 57
mov r10d, 7
.L2:
mov ecx, 64
sub ecx, eax
mov eax, eax
sal rax, 2
mov r8d, ecx
lea rdx, [rdi+rax]
add rax, rsi
shr r8d, 3
vmovups ymm0, YMMWORD PTR [rax+32]
vmulps ymm0, ymm0, YMMWORD PTR [rdx+32]
vmovaps ymm3, YMMWORD PTR [rdx]
vfmadd231ps ymm0, ymm3, YMMWORD PTR [rax]
vmovaps ymm4, YMMWORD PTR [rdx+64]
vfmadd231ps ymm0, ymm4, YMMWORD PTR [rax+64]
vmovaps ymm5, YMMWORD PTR [rdx+96]
vfmadd231ps ymm0, ymm5, YMMWORD PTR [rax+96]
vmovaps ymm6, YMMWORD PTR [rdx+128]
vmovaps ymm7, YMMWORD PTR [rdx+160]
vfmadd231ps ymm0, ymm6, YMMWORD PTR [rax+128]
vmovaps ymm3, YMMWORD PTR [rdx+192]
vfmadd231ps ymm0, ymm7, YMMWORD PTR [rax+160]
vfmadd231ps ymm0, ymm3, YMMWORD PTR [rax+192]
cmp r8d, 8
jne .L4
vmovaps ymm4, YMMWORD PTR [rdx+224]
vfmadd231ps ymm0, ymm4, YMMWORD PTR [rax+224]
.L4:
vhaddps ymm0, ymm0, ymm0
mov r8d, ecx
mov edx, r9d
and r8d, -8
lea eax, [r8+r10]
sub edx, r8d
vhaddps ymm2, ymm0, ymm0
vperm2f128 ymm0, ymm2, ymm2, 1
vaddps ymm0, ymm0, ymm2
vaddss xmm0, xmm0, xmm1
cmp ecx, r8d
je .L31
movsx rcx, eax
vmovss xmm5, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm5, DWORD PTR [rsi+rcx*4]
lea ecx, [rax+1]
cmp edx, 1
je .L31
movsx rcx, ecx
vmovss xmm6, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm6, DWORD PTR [rsi+rcx*4]
lea ecx, [rax+2]
cmp edx, 2
je .L31
movsx rcx, ecx
vmovss xmm7, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm7, DWORD PTR [rsi+rcx*4]
lea ecx, [rax+3]
cmp edx, 3
je .L31
movsx rcx, ecx
vmovss xmm2, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm2, DWORD PTR [rsi+rcx*4]
lea ecx, [rax+4]
cmp edx, 4
je .L31
movsx rcx, ecx
vmovss xmm7, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm7, DWORD PTR [rsi+rcx*4]
lea ecx, [rax+5]
cmp edx, 5
je .L31
movsx rcx, ecx
add eax, 6
vmovss xmm5, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm5, DWORD PTR [rsi+rcx*4]
cmp edx, 6
je .L31
cdqe
vmovss xmm6, DWORD PTR [rdi+rax*4]
vfmadd231ss xmm0, xmm6, DWORD PTR [rsi+rax*4]
.L31:
vzeroupper
ret
.L10:
mov r9d, 60
mov r10d, 4
jmp .L2
.L7:
mov r9d, 63
mov r10d, 1
jmp .L2
.L6:
mov r9d, 64
xor r10d, r10d
vxorps xmm1, xmm1, xmm1
jmp .L2
.L8:
mov r9d, 62
mov r10d, 2
jmp .L2
.L9:
mov r9d, 61
mov r10d, 3
jmp .L2
.L11:
mov r9d, 59
mov r10d, 5
jmp .L2
.L12:
mov r9d, 58
mov r10d, 6
jmp .L2
However, the following output from clang trunk seems more efficient:
f: # @f
vmovups ymm0, ymmword ptr [rsi]
vmovups ymm1, ymmword ptr [rsi + 32]
vmovups ymm2, ymmword ptr [rsi + 64]
vmovups ymm3, ymmword ptr [rsi + 96]
vmulps ymm0, ymm0, ymmword ptr [rdi]
vfmadd231ps ymm0, ymm1, ymmword ptr [rdi + 32]
vfmadd231ps ymm0, ymm2, ymmword ptr [rdi + 64]
vfmadd231ps ymm0, ymm3, ymmword ptr [rdi + 96]
vmovups ymm1, ymmword ptr [rsi + 128]
vfmadd132ps ymm1, ymm0, ymmword ptr [rdi + 128]
vmovups ymm0, ymmword ptr [rsi + 160]
vfmadd132ps ymm0, ymm1, ymmword ptr [rdi + 160]
vmovups ymm1, ymmword ptr [rsi + 192]
vfmadd132ps ymm1, ymm0, ymmword ptr [rdi + 192]
vmovups ymm0, ymmword ptr [rsi + 224]
vfmadd132ps ymm0, ymm1, ymmword ptr [rdi + 224]
vextractf128 xmm1, ymm0, 1
vaddps ymm0, ymm0, ymm1
vpermilpd xmm1, xmm0, 1 # xmm1 = xmm0[1,0]
vaddps ymm0, ymm0, ymm1
vhaddps ymm0, ymm0, ymm0
vzeroupper
ret
It seems that GCC is going to some lengths to align the data, which may not be
worth the cost.