This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.



[Bug middle-end/88361] New: gcc does not unroll loop


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88361

            Bug ID: 88361
           Summary: gcc does not unroll loop
           Product: gcc
           Version: 8.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: bugzilla@poradnik-webmastera.com
  Target Milestone: ---

[code]
#include "immintrin.h"

#define SIZE 9

int src[SIZE][SIZE] __attribute__((aligned(16)));
int dst1[SIZE][SIZE] __attribute__((aligned(16)));
int dst2[SIZE][SIZE] __attribute__((aligned(16)));

void test1()
{
    for (int i = 0; i < SIZE; ++i)
    {
        for (int j = 0; j < SIZE; ++j)
        {
            dst1[i][j] = src[i][j];
            dst2[i][j] = 1u << src[i][j];
        }
    }
}

#pragma GCC push_options
#pragma GCC optimize ("unroll-loops")
void test2()
{
    int n = 0;
    for (; n < SIZE*SIZE-3; n += 4)
    {
        // Copy data
        __m128i v = _mm_load_si128((const __m128i*)(&src[0][0] + n));
        _mm_store_si128((__m128i*)(&dst1[0][0] + n), v);

        // Calculate bitmasks
        v = _mm_sllv_epi32(_mm_set1_epi32(1), v);
        _mm_store_si128((__m128i*)(&dst2[0][0] + n), v);
    }

    // Scalar tail: handle the remaining element(s)
    for (; n < SIZE*SIZE; n++)
    {
        int x = *(&src[0][0] + n);
        *((&dst1[0][0] + n)) = x;
        *((&dst2[0][0] + n)) = 1 << x;
    }
}
#pragma GCC pop_options
[/code]

When the code above is compiled with gcc 8.2 using -O3 -mavx2 -mprefer-avx128,
the loops in test1() are unrolled and vectorized as expected. However, the
loops in test2() are not unrolled completely, even with the unroll pragma:

[asm]
test2():
  mov eax, OFFSET FLAT:dst1
  mov esi, OFFSET FLAT:src
  mov ecx, 40
  xor edx, edx
  mov rdi, rax
  vmovdqa xmm1, XMMWORD PTR .LC0[rip]
  rep movsq
.L4:
  vpsllvd xmm0, xmm1, XMMWORD PTR src[rdx]
  lea rax, [rdx+16]
  vmovaps XMMWORD PTR dst2[rdx], xmm0
  vpsllvd xmm0, xmm1, XMMWORD PTR src[rdx+16]
  vmovaps XMMWORD PTR dst2[rax], xmm0
  vpsllvd xmm0, xmm1, XMMWORD PTR src[rdx+32]
  vmovaps XMMWORD PTR dst2[rdx+32], xmm0
  vpsllvd xmm0, xmm1, XMMWORD PTR src[rax+32]
  lea rdx, [rax+144]
  vmovaps XMMWORD PTR dst2[rax+32], xmm0
  vpsllvd xmm0, xmm1, XMMWORD PTR src[rax+48]
  vmovaps XMMWORD PTR dst2[rax+48], xmm0
  vpsllvd xmm0, xmm1, XMMWORD PTR src[rax+64]
  vmovaps XMMWORD PTR dst2[rax+64], xmm0
  vpsllvd xmm0, xmm1, XMMWORD PTR src[rax+80]
  vmovaps XMMWORD PTR dst2[rax+80], xmm0
  vpsllvd xmm0, xmm1, XMMWORD PTR src[rax+96]
  vmovaps XMMWORD PTR dst2[rax+96], xmm0
  vpsllvd xmm0, xmm1, XMMWORD PTR src[rax+112]
  vmovaps XMMWORD PTR dst2[rax+112], xmm0
  vpsllvd xmm0, xmm1, XMMWORD PTR src[rax+128]
  vmovaps XMMWORD PTR dst2[rax+128], xmm0
  cmp rax, 176
  jne .L4
  mov ecx, DWORD PTR src[rip+320]
  mov eax, 1
  sal eax, cl
  mov DWORD PTR dst1[rip+320], ecx
  mov DWORD PTR dst2[rip+320], eax
  ret
[/asm]
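
In the listing above the intrinsics loop is unrolled ten times, but two
iterations of .L4 still remain instead of the loop being unrolled completely.
For comparison, below is a minimal, untested sketch of the same loop annotated
with GCC 8's per-loop #pragma GCC unroll; the function name and the factor 20
are illustrative only and not part of the original testcase:

[code]
// Hypothetical variant, not taken from the report: the same vector loop with
// GCC 8's per-loop pragma requesting complete unrolling (20 iterations for
// SIZE*SIZE == 81 and a step of 4). Reuses src/dst1/dst2, SIZE and
// <immintrin.h> from the testcase above. Untested; shown only for comparison
// with the function-level "unroll-loops" option.
void test2_unroll_pragma()
{
    int n = 0;
    #pragma GCC unroll 20
    for (; n < SIZE*SIZE-3; n += 4)
    {
        // Copy data
        __m128i v = _mm_load_si128((const __m128i*)(&src[0][0] + n));
        _mm_store_si128((__m128i*)(&dst1[0][0] + n), v);

        // Calculate bitmasks
        v = _mm_sllv_epi32(_mm_set1_epi32(1), v);
        _mm_store_si128((__m128i*)(&dst2[0][0] + n), v);
    }

    // Scalar tail: handle the remaining element(s)
    for (; n < SIZE*SIZE; n++)
    {
        int x = *(&src[0][0] + n);
        *((&dst1[0][0] + n)) = x;
        *((&dst2[0][0] + n)) = 1 << x;
    }
}
[/code]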

This issue also exists in gcc 8.2 for AArch64. I found it there first, and then
verified that it is also present on x86_64.
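
For reference, a possible NEON formulation of the same vector loop is sketched
below. This is only an assumed mapping of the testcase to AArch64 (vshlq_s32
shifts each lane of its first operand left by the per-lane amounts in its
second operand, analogous to _mm_sllv_epi32), not the exact code used there:

[code]
// Illustrative AArch64/NEON equivalent of the vector loop (assumed mapping,
// not the reporter's original AArch64 code). Reuses src/dst1/dst2 and SIZE
// from the testcase above.
#include <arm_neon.h>

void test2_neon()
{
    int n = 0;
    for (; n < SIZE*SIZE-3; n += 4)
    {
        // Copy data
        int32x4_t v = vld1q_s32(&src[0][0] + n);
        vst1q_s32(&dst1[0][0] + n, v);

        // Calculate bitmasks: per-lane 1 << src element
        v = vshlq_s32(vdupq_n_s32(1), v);
        vst1q_s32(&dst2[0][0] + n, v);
    }

    // Scalar tail: handle the remaining element(s)
    for (; n < SIZE*SIZE; n++)
    {
        int x = *(&src[0][0] + n);
        *((&dst1[0][0] + n)) = x;
        *((&dst2[0][0] + n)) = 1 << x;
    }
}
[/code]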
