This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/84106] gcc is not able to vectorize code for 1D array, but does so for 2D array of the same size
- From: "bugzilla at poradnik-webmastera dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Tue, 30 Jan 2018 12:53:19 +0000
- Subject: [Bug tree-optimization/84106] gcc is not able to vectorize code for 1D array, but does so for 2D array of the same size
- Auto-submitted: auto-generated
- References: <bug-84106-4@http.gcc.gnu.org/bugzilla/>
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84106
--- Comment #2 from Daniel Fruzynski <bugzilla@poradnik-webmastera.com> ---
The test included in comment 0 is part of a bigger test which I performed. In the full
version the code was also computing a bitmask and storing it in a 3rd array. For test1 gcc
was able to vectorize the inner loop into a series of load-shift-store-store
operations. In test2 it separated the loop into two: the 1st one performing memcpy
using "rep movsq", the 2nd one calculating bitmasks using vector instructions. Here
is the full code and output:
[code]
#include <stdint.h>
#define N 9
int a1[N][N];
int a2[N][N];
int a3[N][N];
int b1[N*N];
int b2[N*N];
int b3[N*N];
void test1()
{
for (int i = 0; i < N; ++i)
{
for (int j = 0; j < N; ++j)
{
a2[i][j] = a1[i][j];
a3[i][j] = 1u << (uint8_t)a1[i][j];
}
}
}
void test2()
{
for (int i = 0; i < N*N; ++i)
{
b2[i] = b1[i];
b3[i] = 1u << b1[i];
}
}
[/code]
[out]
test1():
vmovdqa ymm0, YMMWORD PTR .LC0[rip]
vpsllvd ymm1, ymm0, YMMWORD PTR a1[rip]
mov eax, 1
vmovdqa ymm5, YMMWORD PTR a1[rip+96]
vmovdqa ymm6, YMMWORD PTR a1[rip+128]
vmovdqa ymm7, YMMWORD PTR a1[rip+160]
vmovdqa ymm2, YMMWORD PTR a1[rip]
vmovdqa YMMWORD PTR a3[rip], ymm1
vpsllvd ymm1, ymm0, YMMWORD PTR a1[rip+32]
vmovdqa ymm3, YMMWORD PTR a1[rip+32]
vmovdqa YMMWORD PTR a2[rip], ymm2
vmovdqa ymm2, YMMWORD PTR a1[rip+192]
vmovdqa ymm4, YMMWORD PTR a1[rip+64]
vmovdqa YMMWORD PTR a2[rip+32], ymm3
vmovdqa ymm3, YMMWORD PTR a1[rip+224]
vmovdqa YMMWORD PTR a3[rip+32], ymm1
vpsllvd ymm1, ymm0, YMMWORD PTR a1[rip+64]
vmovdqa YMMWORD PTR a2[rip+64], ymm4
vmovdqa ymm4, YMMWORD PTR a1[rip+256]
vmovdqa YMMWORD PTR a2[rip+96], ymm5
vmovdqa YMMWORD PTR a3[rip+64], ymm1
vpsllvd ymm1, ymm0, ymm5
vmovdqa ymm5, YMMWORD PTR a1[rip+288]
vmovdqa YMMWORD PTR a2[rip+128], ymm6
vmovdqa YMMWORD PTR a3[rip+96], ymm1
vpsllvd ymm1, ymm0, ymm6
vmovdqa YMMWORD PTR a2[rip+160], ymm7
vmovdqa YMMWORD PTR a3[rip+128], ymm1
vpsllvd ymm1, ymm0, ymm7
vmovdqa YMMWORD PTR a2[rip+192], ymm2
vmovdqa YMMWORD PTR a3[rip+160], ymm1
vpsllvd ymm1, ymm0, ymm2
vmovdqa YMMWORD PTR a2[rip+224], ymm3
vmovdqa YMMWORD PTR a3[rip+192], ymm1
vpsllvd ymm1, ymm0, ymm3
vmovdqa YMMWORD PTR a2[rip+256], ymm4
vmovdqa YMMWORD PTR a3[rip+224], ymm1
vpsllvd ymm1, ymm0, ymm4
vpsllvd ymm0, ymm0, ymm5
vmovdqa YMMWORD PTR a3[rip+256], ymm1
vmovdqa YMMWORD PTR a2[rip+288], ymm5
mov ecx, DWORD PTR a1[rip+320]
vmovdqa YMMWORD PTR a3[rip+288], ymm0
sal eax, cl
mov DWORD PTR a2[rip+320], ecx
mov DWORD PTR a3[rip+320], eax
vzeroupper
ret
test2():
mov esi, OFFSET FLAT:b1
mov edi, OFFSET FLAT:b2
mov ecx, 40
vmovdqa ymm0, YMMWORD PTR .LC0[rip]
rep movsq
vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip]
mov ecx, DWORD PTR b1[rip+320]
vmovdqa YMMWORD PTR b3[rip], ymm1
vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+32]
vmovdqa YMMWORD PTR b3[rip+32], ymm1
vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+64]
mov eax, DWORD PTR [rsi]
mov DWORD PTR [rdi], eax
mov eax, 1
vmovdqa YMMWORD PTR b3[rip+64], ymm1
vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+96]
sal eax, cl
mov DWORD PTR b3[rip+320], eax
vmovdqa YMMWORD PTR b3[rip+96], ymm1
vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+128]
vmovdqa YMMWORD PTR b3[rip+128], ymm1
vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+160]
vmovdqa YMMWORD PTR b3[rip+160], ymm1
vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+192]
vmovdqa YMMWORD PTR b3[rip+192], ymm1
vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+224]
vmovdqa YMMWORD PTR b3[rip+224], ymm1
vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+256]
vpsllvd ymm0, ymm0, YMMWORD PTR b1[rip+288]
vmovdqa YMMWORD PTR b3[rip+256], ymm1
vmovdqa YMMWORD PTR b3[rip+288], ymm0
vzeroupper
ret
b3:
.zero 324
b2:
.zero 324
b1:
.zero 324
a3:
.zero 324
a2:
.zero 324
a1:
.zero 324
.LC0:
.long 1
.long 1
.long 1
.long 1
.long 1
.long 1
.long 1
.long 1
[/out]