[Bug target/108229] New: [13 Regression] unprofitable STV transform
amonakov at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Mon Dec 26 17:38:12 GMT 2022
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229
Bug ID: 108229
Summary: [13 Regression] unprofitable STV transform
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: amonakov at gcc dot gnu.org
Target Milestone: ---
Target: x86_64-*-*
In the following example, the STV (scalar-to-vector) pass makes a very
unprofitable transformation on trunk, but not on gcc-12:
#include <stddef.h>
#include <stdint.h>
/* Singly linked list node: a `next` pointer followed by 511 64-bit words
   (8 + 511*8 = 4096 bytes when pointers are 8 bytes wide). */
struct b {
struct b *next;
uint64_t data[511];
};
/* 16-byte vector holding two uint64_t lanes (GCC vector extension). */
typedef uint64_t u64v2 __attribute__((vector_size(16)));
/* Add the n 64-bit words starting at x into the four vector accumulators
   s[0..3], lane by lane.  Each iteration consumes four u64v2 vectors
   (8 words) and subtracts 8 from n; the loop stops only when n is exactly 0,
   so n must be a multiple of 8. */
static inline
void vsum(u64v2 s[], uint64_t *x, size_t n)
{
/* may_alias variant of u64v2, used to read the uint64_t array through a
   vector-typed pointer. */
typedef u64v2 u64v2_u __attribute__((may_alias));
u64v2_u *vx = (void *)x;
for (; n; vx += 4, n -= 8) {
s[0] += vx[0];
s[1] += vx[1];
s[2] += vx[2];
s[3] += vx[3];
}
}
/* Return the sum of every data[] word of every node in the list headed by b.
   For each node, data[7..510] (504 words) is accumulated in the vector
   accumulators vs[0..3] via vsum(), and data[0..6] is accumulated in the
   scalar s.  b is dereferenced before the loop condition is tested, so it
   must not be NULL on entry. */
uint64_t sum(struct b *b)
{
uint64_t s = 0;
u64v2 vs[4] = { 0 };
do {
/* Vector part: 511-7 = 504 words = 63 iterations of 8 words. */
vsum(vs, b->data + 7, 511-7);
/* Scalar part: the seven leading words; the pragma requests unrolling
   of this 7-iteration loop. */
#pragma GCC unroll(7)
for (int i = 0; i < 7; i++)
s += b->data[i];
} while ((b = b->next));
/* Fold the four vector accumulators into vs[0], then add both lanes to s. */
vs[0] += vs[1] + vs[2] + vs[3];
return s + vs[0][0] + vs[0][1];
}
gcc -O2 -mavx (-mavx is not necessary, plain -O2 also triggers it):
# sum() with STV enabled (Intel syntax, compiler output).
# Register roles as visible in this listing:
#   rdi  = b (current node)      rax = vx cursor      rdx = end of b->data
#   xmm0 = vs[0], xmm3 = vs[1], xmm1 = vs[2], xmm2 = vs[3]
#   xmm5 = scalar accumulator s across outer-loop iterations
#          (also reused as a load temporary inside the scalar part)
sum:
# Zero the four vector accumulators and s (held in xmm5).
vpxor xmm2, xmm2, xmm2
vmovdqa xmm1, xmm2
vmovdqa xmm3, xmm2
vmovdqa xmm0, xmm2
vmovdqa xmm5, xmm2
# Outer loop: one iteration per list node.
.L3:
# rax = &b->data[7] (b + 8 + 7*8), rdx = b + 4096 (one past data[510]).
lea rax, [rdi+64]
lea rdx, [rdi+4096]
# Inner loop (inlined vsum): 64 bytes = four 16-byte vectors per iteration.
.L2:
vpaddq xmm0, xmm0, XMMWORD PTR [rax]
vpaddq xmm3, xmm3, XMMWORD PTR [rax+16]
add rax, 64
# rax was already advanced, so -32/-16 address vx[2]/vx[3] of this iteration.
vpaddq xmm1, xmm1, XMMWORD PTR [rax-32]
vpaddq xmm2, xmm2, XMMWORD PTR [rax-16]
cmp rdx, rax
jne .L2
# Scalar part s += data[0..6], done in vector registers:
# each word is loaded with vmovq and added with vpaddq (7 loads + 7 adds).
# xmm4 = data[0] + data[1]
vmovq xmm6, QWORD PTR [rdi+16]
vmovq xmm4, QWORD PTR [rdi+8]
vpaddq xmm4, xmm4, xmm6
# xmm4 += s (previous total from xmm5); xmm5 is then free as a temporary.
vpaddq xmm4, xmm4, xmm5
vmovq xmm5, QWORD PTR [rdi+24]
vpaddq xmm4, xmm4, xmm5
vmovq xmm5, QWORD PTR [rdi+32]
vpaddq xmm4, xmm4, xmm5
vmovq xmm5, QWORD PTR [rdi+40]
vpaddq xmm4, xmm4, xmm5
vmovq xmm5, QWORD PTR [rdi+48]
vpaddq xmm4, xmm4, xmm5
vmovq xmm5, QWORD PTR [rdi+56]
# b = b->next
mov rdi, QWORD PTR [rdi]
# xmm5 = new running total s (xmm4 + data[6]).
vpaddq xmm5, xmm4, xmm5
test rdi, rdi
jne .L3
# Epilogue: xmm0 = vs[0] + vs[1] + vs[2] + vs[3].
vpaddq xmm1, xmm1, xmm2
vpaddq xmm0, xmm0, xmm3
vpaddq xmm0, xmm0, xmm1
# Horizontal add: shift the high lane down by 8 bytes and add it to the low lane.
vmovdqa xmm1, xmm0
vpsrldq xmm0, xmm0, 8
vpaddq xmm0, xmm1, xmm0
# Add s (xmm5) and move the low 64 bits to the return register.
vpaddq xmm0, xmm0, xmm5
vmovq rax, xmm0
ret
Compare with gcc -O2 -mavx -mno-stv:
# sum() with -mno-stv (Intel syntax, compiler output).
# Register roles as visible in this listing:
#   rdi  = b (current node)      rax = vx cursor / scratch      rcx = end of b->data
#   rdx  = scalar accumulator s (general-purpose register)
#   xmm0 = vs[0], xmm3 = vs[1], xmm1 = vs[2], xmm2 = vs[3]
sum:
# Zero the four vector accumulators and s (edx).
vpxor xmm2, xmm2, xmm2
xor edx, edx
vmovdqa xmm1, xmm2
vmovdqa xmm3, xmm2
vmovdqa xmm0, xmm2
# Outer loop: one iteration per list node.
.L3:
# rax = &b->data[7] (b + 8 + 7*8), rcx = b + 4096 (one past data[510]).
lea rax, [rdi+64]
lea rcx, [rdi+4096]
# Inner loop (inlined vsum): 64 bytes = four 16-byte vectors per iteration.
.L2:
vpaddq xmm0, xmm0, XMMWORD PTR [rax]
vpaddq xmm3, xmm3, XMMWORD PTR [rax+16]
add rax, 64
# rax was already advanced, so -32/-16 address vx[2]/vx[3] of this iteration.
vpaddq xmm1, xmm1, XMMWORD PTR [rax-32]
vpaddq xmm2, xmm2, XMMWORD PTR [rax-16]
cmp rcx, rax
jne .L2
# Scalar part s += data[0..6], done with integer adds using memory operands
# (1 load + 7 adds).
mov rax, QWORD PTR [rdi+16]
add rax, QWORD PTR [rdi+8]
add rdx, rax
add rdx, QWORD PTR [rdi+24]
add rdx, QWORD PTR [rdi+32]
add rdx, QWORD PTR [rdi+40]
add rdx, QWORD PTR [rdi+48]
add rdx, QWORD PTR [rdi+56]
# b = b->next
mov rdi, QWORD PTR [rdi]
test rdi, rdi
jne .L3
# Epilogue: xmm0 = vs[0] + vs[1] + vs[2] + vs[3].
vpaddq xmm0, xmm0, xmm3
vpaddq xmm1, xmm1, xmm2
vpaddq xmm0, xmm0, xmm1
# Extract both 64-bit lanes and add them together with s.
vmovq rcx, xmm0
vpextrq rax, xmm0, 1
add rax, rcx
add rax, rdx
ret
More information about the Gcc-bugs
mailing list