This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug rtl-optimization/67577] New: Trivial float-vectorization foiled by a loop
- From: "bisqwit at iki dot fi" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Mon, 14 Sep 2015 17:23:30 +0000
- Subject: [Bug rtl-optimization/67577] New: Trivial float-vectorization foiled by a loop
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67577
Bug ID: 67577
Summary: Trivial float-vectorization foiled by a loop
Product: gcc
Version: 5.2.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: bisqwit at iki dot fi
Target Milestone: ---
This code is written as if tailored to be SIMD-optimized by GCC...
But GCC somehow blows it.
// Minimal fixed-size arithmetic vector, quoted verbatim as the reproduction
// testcase for GCC bug 67577 (missed vectorization).  NOTE(review): the
// reported mis-optimization is sensitive to trivial source changes, so only
// comments are added here -- no tokens are altered.
template<typename T, unsigned N>
struct vec
{
T d[N]; // plain contiguous array so the element loops are trivially vectorizable
// Element-wise scale by a scalar: result.d[n] = d[n] * b for every n.
vec<T,N> operator* (const T& b)
{
vec<T,N> result;
for(unsigned n=0u; n<N; ++n) result.d[n] = d[n] * b;
return result;
}
// Element-wise sum: result.d[n] = d[n] + b.d[n].
vec<T,N> operator+ (const vec<T,N>& b)
{
vec<T,N> result;
for(unsigned n=0u; n<N; ++n) result.d[n] = d[n] + b.d[n];
return result;
}
// Element-wise difference: result.d[n] = d[n] - b.d[n].
vec<T,N> operator- (const vec<T,N>& b)
{
vec<T,N> result;
for(unsigned n=0u; n<N; ++n) result.d[n] = d[n] - b.d[n];
return result;
}
};
// Globals (rather than locals/parameters) are part of the reproduction:
// the report notes that codegen changes when `scale` becomes a parameter.
float scale;
vec<float,8> a, b, c;
// Computes c = b + (a - b) * scale, i.e. a linear interpolation from b
// toward a.  The single-iteration for-loop is deliberate: the report says
// removing it changes the code GCC generates, which is the bug being filed.
void x()
{
for(int n=0; n<1; ++n)
{
vec<float,8> result = b + (a - b) * scale;
c = result;
}
}
Generated code (inner loop):
movss b+4(%rip), %xmm6
movss a+4(%rip), %xmm7
subss %xmm6, %xmm7
movss scale(%rip), %xmm0
movss b+8(%rip), %xmm5
movss b+12(%rip), %xmm4
movss b+16(%rip), %xmm3
mulss %xmm0, %xmm7
movss b+20(%rip), %xmm1
movss b+24(%rip), %xmm2
movss b+28(%rip), %xmm9
movss b(%rip), %xmm8
addss %xmm6, %xmm7
movss a+8(%rip), %xmm6
subss %xmm5, %xmm6
movss %xmm7, c+4(%rip)
mulss %xmm0, %xmm6
addss %xmm5, %xmm6
movss a+12(%rip), %xmm5
subss %xmm4, %xmm5
movss %xmm6, c+8(%rip)
mulss %xmm0, %xmm5
addss %xmm4, %xmm5
movss a+16(%rip), %xmm4
subss %xmm3, %xmm4
movss %xmm5, c+12(%rip)
mulss %xmm0, %xmm4
addss %xmm3, %xmm4
movss a+20(%rip), %xmm3
subss %xmm1, %xmm3
movss %xmm4, c+16(%rip)
mulss %xmm0, %xmm3
addss %xmm1, %xmm3
movss a+24(%rip), %xmm1
subss %xmm2, %xmm1
movss %xmm3, c+20(%rip)
mulss %xmm0, %xmm1
addss %xmm2, %xmm1
movss a+28(%rip), %xmm2
subss %xmm9, %xmm2
movss %xmm1, c+24(%rip)
mulss %xmm0, %xmm2
addss %xmm9, %xmm2
movss a(%rip), %xmm9
subss %xmm8, %xmm9
movss %xmm2, c+28(%rip)
mulss %xmm9, %xmm0
addss %xmm8, %xmm0
movss %xmm0, c(%rip)
Platform: amd64; GCC version 5.2.1.
If I comment away the dummy for-loop, the inner loop changes into much
simpler code that vectorizes like I meant to (shown below; note that
"scale" is still a global loaded from memory here -- the
function-parameter variant is covered further down):
movaps b(%rip), %xmm3
movaps b+16(%rip), %xmm1
movaps a+16(%rip), %xmm0
movaps a(%rip), %xmm2
subps %xmm1, %xmm0
movss scale(%rip), %xmm4
subps %xmm3, %xmm2
shufps $0, %xmm4, %xmm4
mulps %xmm4, %xmm0
mulps %xmm4, %xmm2
addps %xmm1, %xmm0
addps %xmm3, %xmm2
movaps %xmm0, -24(%rsp)
movq -16(%rsp), %rax
movaps %xmm2, -40(%rsp)
movq %xmm2, c(%rip)
movq %xmm0, c+16(%rip)
movq -32(%rsp), %rdx
movq %rax, c+24(%rip)
movq %rdx, c+8(%rip)
Although there is still a glitch in the generated code causing redundant
memory transfers, at least the calculations are now done in packed registers.
If instead I change the global "scale" variable into a function parameter,
the following shorter code is generated (essentially the same as what Clang
successfully produces for all three cases):
movaps b+16(%rip), %xmm2
shufps $0, %xmm0, %xmm0
movaps a+16(%rip), %xmm1
subps %xmm2, %xmm1
movaps b(%rip), %xmm3
mulps %xmm0, %xmm1
addps %xmm2, %xmm1
movaps a(%rip), %xmm2
subps %xmm3, %xmm2
movaps %xmm1, c+16(%rip)
mulps %xmm2, %xmm0
addps %xmm3, %xmm0
movaps %xmm0, c(%rip)
Something causes GCC's tree-vectorization to be really rickety and easily
foiled by trivial changes in code, and I'd like to see it fixed at least in
these particular cases.