[Bug c++/87105] New: Autovectorization [X86, SSE2, AVX2, DoublePrecision]

Sat Aug 25 22:58:00 GMT 2018

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87105

            Bug ID: 87105
           Summary: Autovectorization [X86, SSE2, AVX2, DoublePrecision]
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: kobalicek.petr at gmail dot com
  Target Milestone: ---

GCC is unable to autovectorize the following code. It seems that it doesn't
like min/max, but I'm not entirely sure. I extracted the code from my project,
so it's a bit longer than a minimal test case; I hope that's fine. I have also
included the code generated by clang, which is perfectly vectorized and is what
I would like to get from GCC.

The demonstration code
----------------------

#include <algorithm>
#include <cmath>
#include <stdint.h>

// 2D point stored as two doubles, laid out as [x, y].
struct Point {
  double x;
  double y;

  // Defaulted; performs no explicit initialization of `x` or `y`.
  Point() noexcept = default;
  // Defaulted member-wise copy.
  constexpr Point(const Point&) noexcept = default;

  // Builds a point from the given coordinates.
  constexpr Point(double xValue, double yValue) noexcept
    : x{xValue},
      y{yValue} {}
};

// Axis-aligned box stored as four doubles, laid out as [x0, y0, x1, y1].
struct Box {
  double x0;
  double y0;
  double x1;
  double y1;

  // Overwrites all four fields with the given values.
  inline void reset(double x0, double y0, double x1, double y1) noexcept {
    // The parameters shadow the members, so the braces pick up the arguments.
    *this = Box{x0, y0, x1, y1};
  }
};

// Overloads to make vector processing simpler.
// Unary minus: negates both coordinates.
static constexpr Point operator-(const Point& a) noexcept {
  return Point{-a.x, -a.y};
}

// Adds the scalar `b` to each coordinate of `a`.
static constexpr Point operator+(const Point& a, double b) noexcept {
  return Point{a.x + b, a.y + b};
}
// Subtracts the scalar `b` from each coordinate of `a`.
static constexpr Point operator-(const Point& a, double b) noexcept {
  return Point{a.x - b, a.y - b};
}
// Multiplies each coordinate of `a` by the scalar `b`.
static constexpr Point operator*(const Point& a, double b) noexcept {
  return Point{a.x * b, a.y * b};
}
// Divides each coordinate of `a` by the scalar `b`.
static constexpr Point operator/(const Point& a, double b) noexcept {
  return Point{a.x / b, a.y / b};
}

// Component-wise sum of two points.
static constexpr Point operator+(const Point& a, const Point& b) noexcept {
  return Point{a.x + b.x, a.y + b.y};
}
// Component-wise difference of two points.
static constexpr Point operator-(const Point& a, const Point& b) noexcept {
  return Point{a.x - b.x, a.y - b.y};
}
// Component-wise product of two points.
static constexpr Point operator*(const Point& a, const Point& b) noexcept {
  return Point{a.x * b.x, a.y * b.y};
}
// Component-wise quotient of two points.
static constexpr Point operator/(const Point& a, const Point& b) noexcept {
  return Point{a.x / b.x, a.y / b.y};
}

// Adds each coordinate of `b` to the scalar `a`.
static constexpr Point operator+(double a, const Point& b) noexcept {
  return Point{a + b.x, a + b.y};
}
// Subtracts each coordinate of `b` from the scalar `a`.
static constexpr Point operator-(double a, const Point& b) noexcept {
  return Point{a - b.x, a - b.y};
}
// Multiplies the scalar `a` by each coordinate of `b`.
static constexpr Point operator*(double a, const Point& b) noexcept {
  return Point{a * b.x, a * b.y};
}
// Divides the scalar `a` by each coordinate of `b`.
static constexpr Point operator/(double a, const Point& b) noexcept {
  return Point{a / b.x, a / b.y};
}

// Min/Max - different semantics compared to std.
// Returns a copy of `b` when `b < a` holds, otherwise a copy of `a`.
// Whenever the comparison is false (equal values, or a NaN operand for
// floating-point types) the first argument is the one returned.
template<typename T>
constexpr T myMin(const T& a, const T& b) noexcept {
  if (b < a)
    return b;
  return a;
}
// Returns a copy of `b` when `a < b` holds, otherwise a copy of `a`.
// Whenever the comparison is false (equal values, or a NaN operand for
// floating-point types) the first argument is the one returned.
template<typename T>
constexpr T myMax(const T& a, const T& b) noexcept {
  if (a < b)
    return b;
  return a;
}

// Linear interpolation between `a` and `b`: a * (1 - t) + b * t.
// `t` may be a scalar or any type for which `1.0 - t`, `a * (1.0 - t)` and
// `b * t` are defined (for example a Point, giving a per-component weight).
template<typename V, typename T = double>
inline V lerp(const V& a, const V& b, const T& t) noexcept {
  const auto weightA = 1.0 - t;       // weight applied to `a`
  const auto partA = a * weightA;
  const auto partB = b * t;           // `t` itself is the weight applied to `b`
  return partA + partB;
}

// Grows `box` just enough to contain `p`: each lower bound (x0, y0) is
// replaced when the point's coordinate is smaller, each upper bound (x1, y1)
// when it is larger. Bounds that already contain the point are left untouched.
inline void boxMergePoint(Box& box, const Point& p) noexcept {
  // Lower corner.
  if (p.x < box.x0) box.x0 = p.x;
  if (p.y < box.y0) box.y0 = p.y;

  // Upper corner.
  if (box.x1 < p.x) box.x1 = p.x;
  if (box.y1 < p.y) box.y1 = p.y;
}

void quadBoundingBoxA(const Point bez[3], Box& bBox) noexcept {
  // Bounding box of start and end points.
  bBox.reset(myMin(bez[0].x, bez[2].x), myMin(bez[0].y, bez[2].y),
             myMax(bez[0].x, bez[2].x), myMax(bez[0].y, bez[2].y));

  Point t = (bez[0] - bez[1]) / (bez[0] - bez[1] * 2.0 + bez[2]);

  t.x = myMax(t.x, 0.0);
  t.y = myMax(t.y, 0.0);
  t.x = myMin(t.x, 1.0);
  t.y = myMin(t.y, 1.0);

  boxMergePoint(bBox, lerp(lerp(bez[0], bez[1], t),
                               lerp(bez[1], bez[2], t), t));
}

GCC Output [-std=c++17 -O3 -mavx2 -fno-math-errno]
--------------------------------------------------

quadBoundingBoxA(Point const*, Box&):
        push    rbp
        mov     rbp, rsp
        and     rsp, -32
        vmovsd  xmm1, QWORD PTR [rdi+8]
        vmovsd  xmm0, QWORD PTR [rdi]
        vmovsd  xmm5, QWORD PTR [rdi+40]
        vmovsd  xmm6, QWORD PTR [rdi+32]
        vmaxsd  xmm13, xmm5, xmm1
        vmaxsd  xmm12, xmm6, xmm0
        vminsd  xmm5, xmm5, xmm1
        vminsd  xmm6, xmm6, xmm0
        vunpcklpd       xmm0, xmm12, xmm13
        vunpcklpd       xmm1, xmm6, xmm5
        vmovups XMMWORD PTR [rsi+16], xmm0
        vmovups XMMWORD PTR [rsi], xmm1
        vmovsd  xmm2, QWORD PTR [rdi+24]
        vmovsd  xmm10, QWORD PTR [rdi+8]
        vmovsd  xmm1, QWORD PTR [rdi+40]
        vmovsd  xmm7, QWORD PTR [rdi+16]
        vaddsd  xmm4, xmm2, xmm2
        vsubsd  xmm9, xmm10, xmm2
        vmovsd  xmm3, QWORD PTR [rdi]
        vmovsd  xmm0, QWORD PTR [rdi+32]
        vsubsd  xmm8, xmm3, xmm7
        vsubsd  xmm4, xmm10, xmm4
        vaddsd  xmm4, xmm4, xmm1
        vdivsd  xmm9, xmm9, xmm4
        vaddsd  xmm4, xmm7, xmm7
        vsubsd  xmm4, xmm3, xmm4
        vaddsd  xmm4, xmm4, xmm0
        vdivsd  xmm8, xmm8, xmm4
        vxorpd  xmm4, xmm4, xmm4
        vcomisd xmm4, xmm8
        ja      .L6
        vcomisd xmm4, xmm9
        jbe     .L36
        vmovsd  xmm11, QWORD PTR .LC1[rip]
        vmulsd  xmm14, xmm1, xmm4
        vmulsd  xmm9, xmm2, xmm4
        vcomisd xmm8, xmm11
        jbe     .L37
        vmovsd  QWORD PTR [rsp-16], xmm2
        vmovapd xmm1, xmm14
        vmovapd xmm2, xmm9
        vxorpd  xmm14, xmm14, xmm14
        vmovsd  QWORD PTR [rsp-8], xmm7
        vmulsd  xmm3, xmm3, xmm4
        vmovapd xmm15, xmm11
        vmovapd xmm8, xmm11
        vmulsd  xmm7, xmm7, xmm4
        vxorpd  xmm9, xmm9, xmm9
        jmp     .L13
.L6:
        vmulsd  xmm11, xmm7, xmm4
        vcomisd xmm4, xmm9
        vxorpd  xmm8, xmm8, xmm8
        vmulsd  xmm0, xmm0, xmm4
        vmovsd  QWORD PTR [rsp-8], xmm11
        vmovsd  xmm11, QWORD PTR .LC1[rip]
        vmovapd xmm14, xmm11
        jbe     .L10
.L19:
        vmovsd  QWORD PTR [rsp-16], xmm2
        vmulsd  xmm1, xmm1, xmm4
        vmovapd xmm15, xmm11
        vxorpd  xmm9, xmm9, xmm9
        vmulsd  xmm2, xmm2, xmm4
        jmp     .L13
.L36:
        vmovsd  xmm11, QWORD PTR .LC1[rip]
        vcomisd xmm8, xmm11
        jbe     .L29
        vmovsd  QWORD PTR [rsp-8], xmm7
        vmulsd  xmm3, xmm3, xmm4
        vxorpd  xmm14, xmm14, xmm14
        vmovapd xmm8, xmm11
        vmulsd  xmm7, xmm7, xmm4
.L10:
        vcomisd xmm9, xmm11
        jbe     .L30
        vmulsd  xmm15, xmm2, xmm4
        vmovapd xmm9, xmm11
        vmulsd  xmm10, xmm10, xmm4
        vmovsd  QWORD PTR [rsp-16], xmm15
        vxorpd  xmm15, xmm15, xmm15
.L13:
        vaddsd  xmm1, xmm1, QWORD PTR [rsp-16]
        vaddsd  xmm3, xmm3, QWORD PTR [rsp-8]
        vaddsd  xmm2, xmm2, xmm10
        vaddsd  xmm0, xmm0, xmm7
        vmulsd  xmm9, xmm1, xmm9
        vmulsd  xmm15, xmm2, xmm15
        vmulsd  xmm8, xmm0, xmm8
        vmulsd  xmm14, xmm3, xmm14
        vaddsd  xmm9, xmm9, xmm15
        vaddsd  xmm14, xmm8, xmm14
        vminsd  xmm5, xmm9, xmm5
        vmaxsd  xmm9, xmm9, xmm13
        vminsd  xmm6, xmm14, xmm6
        vmaxsd  xmm14, xmm14, xmm12
        vmovsd  QWORD PTR [rsi+8], xmm5
        vmovsd  QWORD PTR [rsi+24], xmm9
        vmovsd  QWORD PTR [rsi], xmm6
        vmovsd  QWORD PTR [rsi+16], xmm14
        leave
        ret
.L29:
        vmulsd  xmm15, xmm7, xmm8
        vsubsd  xmm14, xmm11, xmm8
        vmulsd  xmm0, xmm0, xmm8
        vmulsd  xmm3, xmm3, xmm14
        vmulsd  xmm7, xmm7, xmm14
        vmovsd  QWORD PTR [rsp-8], xmm15
        jmp     .L10
.L37:
        vmulsd  xmm15, xmm7, xmm8
        vsubsd  xmm14, xmm11, xmm8
        vmulsd  xmm0, xmm0, xmm8
        vmulsd  xmm3, xmm3, xmm14
        vmulsd  xmm7, xmm7, xmm14
        vmovsd  QWORD PTR [rsp-8], xmm15
        jmp     .L19
.L30:
        vsubsd  xmm15, xmm11, xmm9
        vmulsd  xmm1, xmm1, xmm9
        vmulsd  xmm4, xmm2, xmm15
        vmulsd  xmm10, xmm10, xmm15
        vmulsd  xmm2, xmm2, xmm9
        vmovsd  QWORD PTR [rsp-16], xmm4
        jmp     .L13

Clang Output [-std=c++17 -O3 -mavx2 -fno-math-errno]
----------------------------------------------------

.LCPI0_0:
        .quad   4607182418800017408     # double 1
        .quad   4607182418800017408     # double 1
quadBoundingBoxA(Point const*, Box&):      # @quadBoundingBoxA(Point const*, Box&)
        vmovupd xmm0, xmmword ptr [rdi]
        vmovupd xmm1, xmmword ptr [rdi + 16]
        vmovupd xmm2, xmmword ptr [rdi + 32]
        vminpd  xmm3, xmm2, xmm0
        vmaxpd  xmm4, xmm2, xmm0
        vsubpd  xmm5, xmm0, xmm1
        vaddpd  xmm6, xmm1, xmm1
        vsubpd  xmm6, xmm0, xmm6
        vaddpd  xmm6, xmm2, xmm6
        vdivpd  xmm5, xmm5, xmm6
        vxorpd  xmm6, xmm6, xmm6
        vmaxpd  xmm5, xmm6, xmm5
        vmovapd xmm6, xmmword ptr [rip + .LCPI0_0] # xmm6 = [1.000000e+00,1.000000e+00]
        vminpd  xmm5, xmm6, xmm5
        vsubpd  xmm6, xmm6, xmm5
        vmulpd  xmm0, xmm0, xmm6
        vmulpd  xmm7, xmm1, xmm5
        vaddpd  xmm0, xmm7, xmm0
        vmulpd  xmm1, xmm1, xmm6
        vmulpd  xmm2, xmm2, xmm5
        vaddpd  xmm1, xmm2, xmm1
        vmulpd  xmm0, xmm6, xmm0
        vmulpd  xmm1, xmm5, xmm1
        vaddpd  xmm0, xmm0, xmm1
        vminpd  xmm1, xmm0, xmm3
        vmovupd xmmword ptr [rsi], xmm1
        vmaxpd  xmm0, xmm0, xmm4
        vmovupd xmmword ptr [rsi + 16], xmm0
        ret