[Bug c++/87105] New: Autovectorization [X86, SSE2, AVX2, DoublePrecision]
kobalicek.petr at gmail dot com
gcc-bugzilla@gcc.gnu.org
Sat Aug 25 22:58:00 GMT 2018
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87105
Bug ID: 87105
Summary: Autovectorization [X86, SSE2, AVX2, DoublePrecision]
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: kobalicek.petr at gmail dot com
Target Milestone: ---
GCC is unable to autovectorize the following code. It seems that it doesn't
like min/max, but I'm not entirely sure. I extracted the code from my project,
so it's a bit long; I hope that's fine. I have also included the output produced
by clang, which is fully vectorized and is what I would like to get from GCC.
The demonstration code
----------------------
#include <algorithm>
#include <cmath>
#include <stdint.h>
// Point structure [x, y]
struct Point {
  // Coordinates. Default-initialization (`Point p;`) leaves both indeterminate
  // because the default constructor is defaulted and there are no member
  // initializers.
  double x;
  double y;

  // Defaulted default constructor; assigns nothing.
  inline Point() noexcept = default;

  // Defaulted member-wise copy.
  constexpr Point(const Point&) noexcept = default;

  // Constructs a point from explicit coordinates.
  constexpr Point(double xValue, double yValue) noexcept
    : x{xValue},
      y{yValue} {}
};
// Box structure [x0, y0, x1, y1]
struct Box {
  // Stored in the order x0, y0, x1, y1.
  double x0;
  double y0;
  double x1;
  double y1;

  // Overwrites all four members with the given values, in declaration order.
  inline void reset(double newX0, double newY0, double newX1, double newY1) noexcept {
    x0 = newX0;
    y0 = newY0;
    x1 = newX1;
    y1 = newY1;
  }
};
// Overloads to make vector processing simpler.
// Unary minus: negates each coordinate independently.
static constexpr Point operator-(const Point& p) noexcept {
  return Point{-p.x, -p.y};
}
// Point (op) scalar: the scalar on the right is applied to both coordinates.
static constexpr Point operator+(const Point& p, double s) noexcept {
  return Point{p.x + s, p.y + s};
}

static constexpr Point operator-(const Point& p, double s) noexcept {
  return Point{p.x - s, p.y - s};
}

static constexpr Point operator*(const Point& p, double s) noexcept {
  return Point{p.x * s, p.y * s};
}

static constexpr Point operator/(const Point& p, double s) noexcept {
  return Point{p.x / s, p.y / s};
}
// Point (op) Point: coordinate-wise arithmetic (x with x, y with y).
static constexpr Point operator+(const Point& lhs, const Point& rhs) noexcept {
  return Point{lhs.x + rhs.x, lhs.y + rhs.y};
}

static constexpr Point operator-(const Point& lhs, const Point& rhs) noexcept {
  return Point{lhs.x - rhs.x, lhs.y - rhs.y};
}

static constexpr Point operator*(const Point& lhs, const Point& rhs) noexcept {
  return Point{lhs.x * rhs.x, lhs.y * rhs.y};
}

static constexpr Point operator/(const Point& lhs, const Point& rhs) noexcept {
  return Point{lhs.x / rhs.x, lhs.y / rhs.y};
}
// scalar (op) Point: the scalar is the left operand for both coordinates,
// which matters for the non-commutative `-` and `/`.
static constexpr Point operator+(double s, const Point& p) noexcept {
  return Point{s + p.x, s + p.y};
}

static constexpr Point operator-(double s, const Point& p) noexcept {
  return Point{s - p.x, s - p.y};
}

static constexpr Point operator*(double s, const Point& p) noexcept {
  return Point{s * p.x, s * p.y};
}

static constexpr Point operator/(double s, const Point& p) noexcept {
  return Point{s / p.x, s / p.y};
}
// Min/Max - different semantics compared to std.
// Returns a copy of `b` if `b < a` holds, otherwise a copy of `a`.
// Whenever the comparison is false (equal values, or an unordered comparison
// such as one involving NaN), the first argument is the result.
template<typename T>
constexpr T myMin(const T& a, const T& b) noexcept {
  if (b < a)
    return b;
  return a;
}
// Returns a copy of `b` if `a < b` holds, otherwise a copy of `a`.
// Whenever the comparison is false (equal values, or an unordered comparison
// such as one involving NaN), the first argument is the result.
template<typename T>
constexpr T myMax(const T& a, const T& b) noexcept {
  if (a < b)
    return b;
  return a;
}
// Linear interpolation, works with points as well.
// Blends `a` and `b` as a * (1 - t) + b * t.
// `t` may be a scalar or any type for which `1.0 - t`, `a * (1.0 - t)` and
// `b * t` are defined; the sum is converted to V on return.
template<typename V, typename T = double>
inline V lerp(const V& a, const V& b, const T& t) noexcept {
  const auto weightedA = a * (1.0 - t);
  const auto weightedB = b * t;
  return weightedA + weightedB;
}
// Merge a point into a box by possibly increasing its bounds.
inline void boxMergePoint(Box& box, const Point& p) noexcept {
  const double px = p.x;
  const double py = p.y;

  // x0/x1 are updated from the point's x, y0/y1 from its y. Each member is
  // only replaced when the point lies strictly outside the current value.
  box.x0 = myMin(box.x0, px);
  box.x1 = myMax(box.x1, px);
  box.y0 = myMin(box.y0, py);
  box.y1 = myMax(box.y1, py);
}
void quadBoundingBoxA(const Point bez[3], Box& bBox) noexcept {
// Bounding box of start and end points.
bBox.reset(myMin(bez[0].x, bez[2].x), myMin(bez[0].y, bez[2].y),
myMax(bez[0].x, bez[2].x), myMax(bez[0].y, bez[2].y));
Point t = (bez[0] - bez[1]) / (bez[0] - bez[1] * 2.0 + bez[2]);
t.x = myMax(t.x, 0.0);
t.y = myMax(t.y, 0.0);
t.x = myMin(t.x, 1.0);
t.y = myMin(t.y, 1.0);
boxMergePoint(bBox, lerp(lerp(bez[0], bez[1], t),
lerp(bez[1], bez[2], t), t));
}
GCC Output [-std=c++17 -O3 -mavx2 -fno-math-errno]
--------------------------------------------------
quadBoundingBoxA(Point const*, Box&):
push rbp
mov rbp, rsp
and rsp, -32
vmovsd xmm1, QWORD PTR [rdi+8]
vmovsd xmm0, QWORD PTR [rdi]
vmovsd xmm5, QWORD PTR [rdi+40]
vmovsd xmm6, QWORD PTR [rdi+32]
vmaxsd xmm13, xmm5, xmm1
vmaxsd xmm12, xmm6, xmm0
vminsd xmm5, xmm5, xmm1
vminsd xmm6, xmm6, xmm0
vunpcklpd xmm0, xmm12, xmm13
vunpcklpd xmm1, xmm6, xmm5
vmovups XMMWORD PTR [rsi+16], xmm0
vmovups XMMWORD PTR [rsi], xmm1
vmovsd xmm2, QWORD PTR [rdi+24]
vmovsd xmm10, QWORD PTR [rdi+8]
vmovsd xmm1, QWORD PTR [rdi+40]
vmovsd xmm7, QWORD PTR [rdi+16]
vaddsd xmm4, xmm2, xmm2
vsubsd xmm9, xmm10, xmm2
vmovsd xmm3, QWORD PTR [rdi]
vmovsd xmm0, QWORD PTR [rdi+32]
vsubsd xmm8, xmm3, xmm7
vsubsd xmm4, xmm10, xmm4
vaddsd xmm4, xmm4, xmm1
vdivsd xmm9, xmm9, xmm4
vaddsd xmm4, xmm7, xmm7
vsubsd xmm4, xmm3, xmm4
vaddsd xmm4, xmm4, xmm0
vdivsd xmm8, xmm8, xmm4
vxorpd xmm4, xmm4, xmm4
vcomisd xmm4, xmm8
ja .L6
vcomisd xmm4, xmm9
jbe .L36
vmovsd xmm11, QWORD PTR .LC1[rip]
vmulsd xmm14, xmm1, xmm4
vmulsd xmm9, xmm2, xmm4
vcomisd xmm8, xmm11
jbe .L37
vmovsd QWORD PTR [rsp-16], xmm2
vmovapd xmm1, xmm14
vmovapd xmm2, xmm9
vxorpd xmm14, xmm14, xmm14
vmovsd QWORD PTR [rsp-8], xmm7
vmulsd xmm3, xmm3, xmm4
vmovapd xmm15, xmm11
vmovapd xmm8, xmm11
vmulsd xmm7, xmm7, xmm4
vxorpd xmm9, xmm9, xmm9
jmp .L13
.L6:
vmulsd xmm11, xmm7, xmm4
vcomisd xmm4, xmm9
vxorpd xmm8, xmm8, xmm8
vmulsd xmm0, xmm0, xmm4
vmovsd QWORD PTR [rsp-8], xmm11
vmovsd xmm11, QWORD PTR .LC1[rip]
vmovapd xmm14, xmm11
jbe .L10
.L19:
vmovsd QWORD PTR [rsp-16], xmm2
vmulsd xmm1, xmm1, xmm4
vmovapd xmm15, xmm11
vxorpd xmm9, xmm9, xmm9
vmulsd xmm2, xmm2, xmm4
jmp .L13
.L36:
vmovsd xmm11, QWORD PTR .LC1[rip]
vcomisd xmm8, xmm11
jbe .L29
vmovsd QWORD PTR [rsp-8], xmm7
vmulsd xmm3, xmm3, xmm4
vxorpd xmm14, xmm14, xmm14
vmovapd xmm8, xmm11
vmulsd xmm7, xmm7, xmm4
.L10:
vcomisd xmm9, xmm11
jbe .L30
vmulsd xmm15, xmm2, xmm4
vmovapd xmm9, xmm11
vmulsd xmm10, xmm10, xmm4
vmovsd QWORD PTR [rsp-16], xmm15
vxorpd xmm15, xmm15, xmm15
.L13:
vaddsd xmm1, xmm1, QWORD PTR [rsp-16]
vaddsd xmm3, xmm3, QWORD PTR [rsp-8]
vaddsd xmm2, xmm2, xmm10
vaddsd xmm0, xmm0, xmm7
vmulsd xmm9, xmm1, xmm9
vmulsd xmm15, xmm2, xmm15
vmulsd xmm8, xmm0, xmm8
vmulsd xmm14, xmm3, xmm14
vaddsd xmm9, xmm9, xmm15
vaddsd xmm14, xmm8, xmm14
vminsd xmm5, xmm9, xmm5
vmaxsd xmm9, xmm9, xmm13
vminsd xmm6, xmm14, xmm6
vmaxsd xmm14, xmm14, xmm12
vmovsd QWORD PTR [rsi+8], xmm5
vmovsd QWORD PTR [rsi+24], xmm9
vmovsd QWORD PTR [rsi], xmm6
vmovsd QWORD PTR [rsi+16], xmm14
leave
ret
.L29:
vmulsd xmm15, xmm7, xmm8
vsubsd xmm14, xmm11, xmm8
vmulsd xmm0, xmm0, xmm8
vmulsd xmm3, xmm3, xmm14
vmulsd xmm7, xmm7, xmm14
vmovsd QWORD PTR [rsp-8], xmm15
jmp .L10
.L37:
vmulsd xmm15, xmm7, xmm8
vsubsd xmm14, xmm11, xmm8
vmulsd xmm0, xmm0, xmm8
vmulsd xmm3, xmm3, xmm14
vmulsd xmm7, xmm7, xmm14
vmovsd QWORD PTR [rsp-8], xmm15
jmp .L19
.L30:
vsubsd xmm15, xmm11, xmm9
vmulsd xmm1, xmm1, xmm9
vmulsd xmm4, xmm2, xmm15
vmulsd xmm10, xmm10, xmm15
vmulsd xmm2, xmm2, xmm9
vmovsd QWORD PTR [rsp-16], xmm4
jmp .L13
Clang Output [-std=c++17 -O3 -mavx2 -fno-math-errno]
----------------------------------------------------
.LCPI0_0:
.quad 4607182418800017408 # double 1
.quad 4607182418800017408 # double 1
quadBoundingBoxA(Point const*, Box&): # @quadBoundingBoxA(Point const*, Box&)
vmovupd xmm0, xmmword ptr [rdi]
vmovupd xmm1, xmmword ptr [rdi + 16]
vmovupd xmm2, xmmword ptr [rdi + 32]
vminpd xmm3, xmm2, xmm0
vmaxpd xmm4, xmm2, xmm0
vsubpd xmm5, xmm0, xmm1
vaddpd xmm6, xmm1, xmm1
vsubpd xmm6, xmm0, xmm6
vaddpd xmm6, xmm2, xmm6
vdivpd xmm5, xmm5, xmm6
vxorpd xmm6, xmm6, xmm6
vmaxpd xmm5, xmm6, xmm5
vmovapd xmm6, xmmword ptr [rip + .LCPI0_0] # xmm6 = [1.000000e+00,1.000000e+00]
vminpd xmm5, xmm6, xmm5
vsubpd xmm6, xmm6, xmm5
vmulpd xmm0, xmm0, xmm6
vmulpd xmm7, xmm1, xmm5
vaddpd xmm0, xmm7, xmm0
vmulpd xmm1, xmm1, xmm6
vmulpd xmm2, xmm2, xmm5
vaddpd xmm1, xmm2, xmm1
vmulpd xmm0, xmm6, xmm0
vmulpd xmm1, xmm5, xmm1
vaddpd xmm0, xmm0, xmm1
vminpd xmm1, xmm0, xmm3
vmovupd xmmword ptr [rsi], xmm1
vmaxpd xmm0, xmm0, xmm4
vmovupd xmmword ptr [rsi + 16], xmm0
ret
More information about the Gcc-bugs
mailing list