This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug c/79491] New: Possibly inefficient code for the inner product of two vectors
- From: "drraph at gmail dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Mon, 13 Feb 2017 16:41:37 +0000
- Subject: [Bug c/79491] New: Possibly inefficient code for the inner product of two vectors
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79491
Bug ID: 79491
Summary: Possibly inefficient code for the inner product of two
vectors
Product: gcc
Version: 7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: drraph at gmail dot com
Target Milestone: ---
Consider:
float f(float x[], float y[]) {
float p = 0;
for (int i = 0; i <64; i++)
p += x[i] * y[i];
return p;
}
Using gcc 7 (snapshot) and -Ofast -march=core-avx2 you get:
f:
mov rax, rdi
shr rax, 2
neg rax
and eax, 7
je .L6
vmovss xmm0, DWORD PTR [rdi]
vmulss xmm1, xmm0, DWORD PTR [rsi]
cmp eax, 1
je .L7
vmovss xmm4, DWORD PTR [rdi+4]
vfmadd231ss xmm1, xmm4, DWORD PTR [rsi+4]
cmp eax, 2
je .L8
vmovss xmm3, DWORD PTR [rdi+8]
vfmadd231ss xmm1, xmm3, DWORD PTR [rsi+8]
cmp eax, 3
je .L9
vmovss xmm2, DWORD PTR [rdi+12]
vfmadd231ss xmm1, xmm2, DWORD PTR [rsi+12]
cmp eax, 4
je .L10
vmovss xmm3, DWORD PTR [rdi+16]
vfmadd231ss xmm1, xmm3, DWORD PTR [rsi+16]
cmp eax, 5
je .L11
vmovss xmm7, DWORD PTR [rdi+20]
vfmadd231ss xmm1, xmm7, DWORD PTR [rsi+20]
cmp eax, 7
jne .L12
vmovss xmm4, DWORD PTR [rsi+24]
vfmadd231ss xmm1, xmm4, DWORD PTR [rdi+24]
mov r9d, 57
mov r10d, 7
.L2:
mov ecx, 64
sub ecx, eax
mov eax, eax
sal rax, 2
mov r8d, ecx
lea rdx, [rdi+rax]
add rax, rsi
shr r8d, 3
vmovups ymm0, YMMWORD PTR [rax+32]
vmulps ymm0, ymm0, YMMWORD PTR [rdx+32]
vmovaps ymm3, YMMWORD PTR [rdx]
vfmadd231ps ymm0, ymm3, YMMWORD PTR [rax]
vmovaps ymm4, YMMWORD PTR [rdx+64]
vfmadd231ps ymm0, ymm4, YMMWORD PTR [rax+64]
vmovaps ymm5, YMMWORD PTR [rdx+96]
vfmadd231ps ymm0, ymm5, YMMWORD PTR [rax+96]
vmovaps ymm6, YMMWORD PTR [rdx+128]
vmovaps ymm7, YMMWORD PTR [rdx+160]
vfmadd231ps ymm0, ymm6, YMMWORD PTR [rax+128]
vmovaps ymm3, YMMWORD PTR [rdx+192]
vfmadd231ps ymm0, ymm7, YMMWORD PTR [rax+160]
vfmadd231ps ymm0, ymm3, YMMWORD PTR [rax+192]
cmp r8d, 8
jne .L4
vmovaps ymm4, YMMWORD PTR [rdx+224]
vfmadd231ps ymm0, ymm4, YMMWORD PTR [rax+224]
.L4:
vhaddps ymm0, ymm0, ymm0
mov r8d, ecx
mov edx, r9d
and r8d, -8
lea eax, [r8+r10]
sub edx, r8d
vhaddps ymm2, ymm0, ymm0
vperm2f128 ymm0, ymm2, ymm2, 1
vaddps ymm0, ymm0, ymm2
vaddss xmm0, xmm0, xmm1
cmp ecx, r8d
je .L31
movsx rcx, eax
vmovss xmm5, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm5, DWORD PTR [rsi+rcx*4]
lea ecx, [rax+1]
cmp edx, 1
je .L31
movsx rcx, ecx
vmovss xmm6, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm6, DWORD PTR [rsi+rcx*4]
lea ecx, [rax+2]
cmp edx, 2
je .L31
movsx rcx, ecx
vmovss xmm7, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm7, DWORD PTR [rsi+rcx*4]
lea ecx, [rax+3]
cmp edx, 3
je .L31
movsx rcx, ecx
vmovss xmm2, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm2, DWORD PTR [rsi+rcx*4]
lea ecx, [rax+4]
cmp edx, 4
je .L31
movsx rcx, ecx
vmovss xmm7, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm7, DWORD PTR [rsi+rcx*4]
lea ecx, [rax+5]
cmp edx, 5
je .L31
movsx rcx, ecx
add eax, 6
vmovss xmm5, DWORD PTR [rdi+rcx*4]
vfmadd231ss xmm0, xmm5, DWORD PTR [rsi+rcx*4]
cmp edx, 6
je .L31
cdqe
vmovss xmm6, DWORD PTR [rdi+rax*4]
vfmadd231ss xmm0, xmm6, DWORD PTR [rsi+rax*4]
.L31:
vzeroupper
ret
.L10:
mov r9d, 60
mov r10d, 4
jmp .L2
.L7:
mov r9d, 63
mov r10d, 1
jmp .L2
.L6:
mov r9d, 64
xor r10d, r10d
vxorps xmm1, xmm1, xmm1
jmp .L2
.L8:
mov r9d, 62
mov r10d, 2
jmp .L2
.L9:
mov r9d, 61
mov r10d, 3
jmp .L2
.L11:
mov r9d, 59
mov r10d, 5
jmp .L2
.L12:
mov r9d, 58
mov r10d, 6
jmp .L2
However, the following output from clang trunk seems more efficient:
f: # @f
vmovups ymm0, ymmword ptr [rsi]
vmovups ymm1, ymmword ptr [rsi + 32]
vmovups ymm2, ymmword ptr [rsi + 64]
vmovups ymm3, ymmword ptr [rsi + 96]
vmulps ymm0, ymm0, ymmword ptr [rdi]
vfmadd231ps ymm0, ymm1, ymmword ptr [rdi + 32]
vfmadd231ps ymm0, ymm2, ymmword ptr [rdi + 64]
vfmadd231ps ymm0, ymm3, ymmword ptr [rdi + 96]
vmovups ymm1, ymmword ptr [rsi + 128]
vfmadd132ps ymm1, ymm0, ymmword ptr [rdi + 128]
vmovups ymm0, ymmword ptr [rsi + 160]
vfmadd132ps ymm0, ymm1, ymmword ptr [rdi + 160]
vmovups ymm1, ymmword ptr [rsi + 192]
vfmadd132ps ymm1, ymm0, ymmword ptr [rdi + 192]
vmovups ymm0, ymmword ptr [rsi + 224]
vfmadd132ps ymm0, ymm1, ymmword ptr [rdi + 224]
vextractf128 xmm1, ymm0, 1
vaddps ymm0, ymm0, ymm1
vpermilpd xmm1, xmm0, 1 # xmm1 = xmm0[1,0]
vaddps ymm0, ymm0, ymm1
vhaddps ymm0, ymm0, ymm0
vzeroupper
ret
It seems that GCC is going to some lengths to align the data, which may not be
worth the cost.