[Bug target/108229] New: [13 Regression] unprofitable STV transform

Mon Dec 26 17:38:12 GMT 2022

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229

            Bug ID: 108229
           Summary: [13 Regression] unprofitable STV transform
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: amonakov at gcc dot gnu.org
  Target Milestone: ---
            Target: x86_64-*-*

In the following example, the STV (scalar-to-vector) pass makes a very
unprofitable transformation on trunk, but not on gcc-12:

#include <stddef.h>
#include <stdint.h>

/* Node of a singly linked list; each node carries a fixed-size payload. */
struct b {
        /* Following node; sum() stops walking when this is null. */
        struct b *next;
        /* Payload words; sum() adds up all 511 of them for every node. */
        uint64_t data[511];
};

/* 16-byte vector of two uint64_t lanes (GNU vector_size extension). */
typedef uint64_t u64v2 __attribute__((vector_size(16)));
/*
 * Accumulate the uint64_t words starting at x into the four vector
 * accumulators s[0]..s[3].
 *
 * Each iteration reads four u64v2 vectors (eight uint64_t words) and
 * decrements n by 8, so n counts uint64_t words.  The loop terminates only
 * when n reaches exactly zero, so n must be a multiple of 8; otherwise the
 * unsigned counter wraps past zero and the loop keeps reading.
 */
static inline
void vsum(u64v2 s[], uint64_t *x, size_t n)
{
        /*
         * may_alias permits reading the uint64_t buffer through a vector type.
         * NOTE(review): the _u suffix suggests "unaligned", but no aligned
         * attribute is applied here -- confirm the intended alignment of x.
         */
        typedef u64v2 u64v2_u __attribute__((may_alias));
        u64v2_u *vx = (void *)x;
        /* Four independent accumulators, one per vector loaded per iteration. */
        for (; n; vx += 4, n -= 8) {
                s[0] += vx[0];
                s[1] += vx[1];
                s[2] += vx[2];
                s[3] += vx[3];
        }
}

/*
 * Return the sum of every data[] word of every node in the list headed by b.
 *
 * Per node, words data[7..510] (504 words, a multiple of 8) are added into
 * four vector accumulators by vsum(), and words data[0..6] are added into the
 * scalar s.  The vector lanes are folded into the scalar result once the
 * whole list has been walked.
 *
 * b must be non-null: the do/while body dereferences it before any test.
 */
uint64_t sum(struct b *b)
{
        uint64_t s = 0;
        u64v2 vs[4] = { 0 };
        do {
                /* Vector part: everything after the first 7 words. */
                vsum(vs, b->data + 7, 511-7);
                /*
                 * Scalar part: the first 7 words, with full unrolling
                 * requested.  This is the scalar chain that the -O2 -mavx
                 * listing in this report shows as vmovq/vpaddq.
                 */
#pragma GCC unroll(7)
                for (int i = 0; i < 7; i++)
                        s += b->data[i];
        } while ((b = b->next));
        /* Combine the four vector accumulators into vs[0]. */
        vs[0] += vs[1] + vs[2] + vs[3];
        /* Add both 64-bit lanes of the combined vector to the scalar sum. */
        return s + vs[0][0] + vs[0][1];
}

Output of gcc -O2 -mavx (-mavx is not necessary; plain -O2 also triggers it):

sum:
        vpxor   xmm2, xmm2, xmm2
        vmovdqa xmm1, xmm2
        vmovdqa xmm3, xmm2
        vmovdqa xmm0, xmm2
        vmovdqa xmm5, xmm2
.L3:
        lea     rax, [rdi+64]
        lea     rdx, [rdi+4096]
.L2:
        vpaddq  xmm0, xmm0, XMMWORD PTR [rax]
        vpaddq  xmm3, xmm3, XMMWORD PTR [rax+16]
        add     rax, 64
        vpaddq  xmm1, xmm1, XMMWORD PTR [rax-32]
        vpaddq  xmm2, xmm2, XMMWORD PTR [rax-16]
        cmp     rdx, rax
        jne     .L2
        vmovq   xmm6, QWORD PTR [rdi+16]
        vmovq   xmm4, QWORD PTR [rdi+8]
        vpaddq  xmm4, xmm4, xmm6
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+24]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+32]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+40]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+48]
        vpaddq  xmm4, xmm4, xmm5
        vmovq   xmm5, QWORD PTR [rdi+56]
        mov     rdi, QWORD PTR [rdi]
        vpaddq  xmm5, xmm4, xmm5
        test    rdi, rdi
        jne     .L3
        vpaddq  xmm1, xmm1, xmm2
        vpaddq  xmm0, xmm0, xmm3
        vpaddq  xmm0, xmm0, xmm1
        vmovdqa xmm1, xmm0
        vpsrldq xmm0, xmm0, 8
        vpaddq  xmm0, xmm1, xmm0
        vpaddq  xmm0, xmm0, xmm5
        vmovq   rax, xmm0
        ret

Compare with gcc -O2 -mavx -mno-stv, which keeps the scalar sum in general-purpose registers:

sum:
        vpxor   xmm2, xmm2, xmm2
        xor     edx, edx
        vmovdqa xmm1, xmm2
        vmovdqa xmm3, xmm2
        vmovdqa xmm0, xmm2
.L3:
        lea     rax, [rdi+64]
        lea     rcx, [rdi+4096]
.L2:
        vpaddq  xmm0, xmm0, XMMWORD PTR [rax]
        vpaddq  xmm3, xmm3, XMMWORD PTR [rax+16]
        add     rax, 64
        vpaddq  xmm1, xmm1, XMMWORD PTR [rax-32]
        vpaddq  xmm2, xmm2, XMMWORD PTR [rax-16]
        cmp     rcx, rax
        jne     .L2
        mov     rax, QWORD PTR [rdi+16]
        add     rax, QWORD PTR [rdi+8]
        add     rdx, rax
        add     rdx, QWORD PTR [rdi+24]
        add     rdx, QWORD PTR [rdi+32]
        add     rdx, QWORD PTR [rdi+40]
        add     rdx, QWORD PTR [rdi+48]
        add     rdx, QWORD PTR [rdi+56]
        mov     rdi, QWORD PTR [rdi]
        test    rdi, rdi
        jne     .L3
        vpaddq  xmm0, xmm0, xmm3
        vpaddq  xmm1, xmm1, xmm2
        vpaddq  xmm0, xmm0, xmm1
        vmovq   rcx, xmm0
        vpextrq rax, xmm0, 1
        add     rax, rcx
        add     rax, rdx
        ret