Bug 87105 - Autovectorization [X86, SSE2, AVX2, DoublePrecision]
Summary: Autovectorization [X86, SSE2, AVX2, DoublePrecision]
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 8.2.1
: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: alias, missed-optimization
Depends on:
Blocks: vectorizer
  Show dependency treegraph
 
Reported: 2018-08-25 22:58 UTC by Petr
Modified: 2018-10-26 12:24 UTC (History)
5 users (show)

See Also:
Host:
Target: x86_64-*-* i?86-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2018-08-27 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Petr 2018-08-25 22:58:42 UTC
GCC is unable to autovectorize the following code. It seems that it doesn't like min/max, but I'm not entirely sure. I stripped the code off my project so it's a bit longer, hope that's fine. I attached also a code compiled by clang, which is perfectly vectorized and what I would like to get from GCC.


The demonstration code
----------------------

#include <algorithm>
#include <cmath>
#include <stdint.h>

// Point structure [x, y]
struct Point {
  double x, y;

  inline Point() noexcept = default;
  constexpr Point(const Point&) noexcept = default;

  constexpr Point(double x, double y) noexcept
    : x(x), y(y) {}
};

// Box structure [x0, y0, x1, y1]
struct Box {
  double x0, y0, x1, y1;

  inline void reset(double x0, double y0, double x1, double y1) noexcept {
    this->x0 = x0;
    this->y0 = y0;
    this->x1 = x1;
    this->y1 = y1;
  }
};

// Overloads to make vector processing simpler.
static constexpr Point operator-(const Point& a) noexcept { return Point(-a.x, -a.y); }

static constexpr Point operator+(const Point& a, double b) noexcept
{ return Point(a.x + b, a.y + b); }
static constexpr Point operator-(const Point& a, double b) noexcept
{ return Point(a.x - b, a.y - b); }
static constexpr Point operator*(const Point& a, double b) noexcept
{ return Point(a.x * b, a.y * b); }
static constexpr Point operator/(const Point& a, double b) noexcept
{ return Point(a.x / b, a.y / b); }

static constexpr Point operator+(const Point& a, const Point& b) noexcept
{ return Point(a.x + b.x, a.y + b.y); }
static constexpr Point operator-(const Point& a, const Point& b) noexcept
{ return Point(a.x - b.x, a.y - b.y); }
static constexpr Point operator*(const Point& a, const Point& b) noexcept
{ return Point(a.x * b.x, a.y * b.y); }
static constexpr Point operator/(const Point& a, const Point& b) noexcept
{ return Point(a.x / b.x, a.y / b.y); }

static constexpr Point operator+(double a, const Point& b) noexcept
{ return Point(a + b.x, a + b.y); }
static constexpr Point operator-(double a, const Point& b) noexcept
{ return Point(a - b.x, a - b.y); }
static constexpr Point operator*(double a, const Point& b) noexcept
{ return Point(a * b.x, a * b.y); }
static constexpr Point operator/(double a, const Point& b) noexcept
{ return Point(a / b.x, a / b.y); }

// Min/Max - different semantics compared to std.
template<typename T> constexpr T myMin(const T& a, const T& b) noexcept
{ return b < a ? b : a; }
template<typename T> constexpr T myMax(const T& a, const T& b) noexcept
{ return a < b ? b : a; }

// Linear interpolation, works with points as well.
template<typename V, typename T = double>
inline V lerp(const V& a, const V& b, const T& t) noexcept {
  return (a * (1.0 - t)) + (b * t);
}

// Merge a point into a box by possibly increasing its bounds.
inline void boxMergePoint(Box& box, const Point& p) noexcept {
  box.x0 = myMin(box.x0, p.x);
  box.y0 = myMin(box.y0, p.y);
  box.x1 = myMax(box.x1, p.x);
  box.y1 = myMax(box.y1, p.y);
}

void quadBoundingBoxA(const Point bez[3], Box& bBox) noexcept {
  // Bounding box of start and end points.
  bBox.reset(myMin(bez[0].x, bez[2].x), myMin(bez[0].y, bez[2].y),
             myMax(bez[0].x, bez[2].x), myMax(bez[0].y, bez[2].y));

  Point t = (bez[0] - bez[1]) / (bez[0] - bez[1] * 2.0 + bez[2]);

  t.x = myMax(t.x, 0.0);
  t.y = myMax(t.y, 0.0);
  t.x = myMin(t.x, 1.0);
  t.y = myMin(t.y, 1.0);

  boxMergePoint(bBox, lerp(lerp(bez[0], bez[1], t),
                               lerp(bez[1], bez[2], t), t));
}




GCC Output [-std=c++17 -O3 -mavx2 -fno-math-errno]
--------------------------------------------------

quadBoundingBoxA(Point const*, Box&):
        push    rbp
        mov     rbp, rsp
        and     rsp, -32
        vmovsd  xmm1, QWORD PTR [rdi+8]
        vmovsd  xmm0, QWORD PTR [rdi]
        vmovsd  xmm5, QWORD PTR [rdi+40]
        vmovsd  xmm6, QWORD PTR [rdi+32]
        vmaxsd  xmm13, xmm5, xmm1
        vmaxsd  xmm12, xmm6, xmm0
        vminsd  xmm5, xmm5, xmm1
        vminsd  xmm6, xmm6, xmm0
        vunpcklpd       xmm0, xmm12, xmm13
        vunpcklpd       xmm1, xmm6, xmm5
        vmovups XMMWORD PTR [rsi+16], xmm0
        vmovups XMMWORD PTR [rsi], xmm1
        vmovsd  xmm2, QWORD PTR [rdi+24]
        vmovsd  xmm10, QWORD PTR [rdi+8]
        vmovsd  xmm1, QWORD PTR [rdi+40]
        vmovsd  xmm7, QWORD PTR [rdi+16]
        vaddsd  xmm4, xmm2, xmm2
        vsubsd  xmm9, xmm10, xmm2
        vmovsd  xmm3, QWORD PTR [rdi]
        vmovsd  xmm0, QWORD PTR [rdi+32]
        vsubsd  xmm8, xmm3, xmm7
        vsubsd  xmm4, xmm10, xmm4
        vaddsd  xmm4, xmm4, xmm1
        vdivsd  xmm9, xmm9, xmm4
        vaddsd  xmm4, xmm7, xmm7
        vsubsd  xmm4, xmm3, xmm4
        vaddsd  xmm4, xmm4, xmm0
        vdivsd  xmm8, xmm8, xmm4
        vxorpd  xmm4, xmm4, xmm4
        vcomisd xmm4, xmm8
        ja      .L6
        vcomisd xmm4, xmm9
        jbe     .L36
        vmovsd  xmm11, QWORD PTR .LC1[rip]
        vmulsd  xmm14, xmm1, xmm4
        vmulsd  xmm9, xmm2, xmm4
        vcomisd xmm8, xmm11
        jbe     .L37
        vmovsd  QWORD PTR [rsp-16], xmm2
        vmovapd xmm1, xmm14
        vmovapd xmm2, xmm9
        vxorpd  xmm14, xmm14, xmm14
        vmovsd  QWORD PTR [rsp-8], xmm7
        vmulsd  xmm3, xmm3, xmm4
        vmovapd xmm15, xmm11
        vmovapd xmm8, xmm11
        vmulsd  xmm7, xmm7, xmm4
        vxorpd  xmm9, xmm9, xmm9
        jmp     .L13
.L6:
        vmulsd  xmm11, xmm7, xmm4
        vcomisd xmm4, xmm9
        vxorpd  xmm8, xmm8, xmm8
        vmulsd  xmm0, xmm0, xmm4
        vmovsd  QWORD PTR [rsp-8], xmm11
        vmovsd  xmm11, QWORD PTR .LC1[rip]
        vmovapd xmm14, xmm11
        jbe     .L10
.L19:
        vmovsd  QWORD PTR [rsp-16], xmm2
        vmulsd  xmm1, xmm1, xmm4
        vmovapd xmm15, xmm11
        vxorpd  xmm9, xmm9, xmm9
        vmulsd  xmm2, xmm2, xmm4
        jmp     .L13
.L36:
        vmovsd  xmm11, QWORD PTR .LC1[rip]
        vcomisd xmm8, xmm11
        jbe     .L29
        vmovsd  QWORD PTR [rsp-8], xmm7
        vmulsd  xmm3, xmm3, xmm4
        vxorpd  xmm14, xmm14, xmm14
        vmovapd xmm8, xmm11
        vmulsd  xmm7, xmm7, xmm4
.L10:
        vcomisd xmm9, xmm11
        jbe     .L30
        vmulsd  xmm15, xmm2, xmm4
        vmovapd xmm9, xmm11
        vmulsd  xmm10, xmm10, xmm4
        vmovsd  QWORD PTR [rsp-16], xmm15
        vxorpd  xmm15, xmm15, xmm15
.L13:
        vaddsd  xmm1, xmm1, QWORD PTR [rsp-16]
        vaddsd  xmm3, xmm3, QWORD PTR [rsp-8]
        vaddsd  xmm2, xmm2, xmm10
        vaddsd  xmm0, xmm0, xmm7
        vmulsd  xmm9, xmm1, xmm9
        vmulsd  xmm15, xmm2, xmm15
        vmulsd  xmm8, xmm0, xmm8
        vmulsd  xmm14, xmm3, xmm14
        vaddsd  xmm9, xmm9, xmm15
        vaddsd  xmm14, xmm8, xmm14
        vminsd  xmm5, xmm9, xmm5
        vmaxsd  xmm9, xmm9, xmm13
        vminsd  xmm6, xmm14, xmm6
        vmaxsd  xmm14, xmm14, xmm12
        vmovsd  QWORD PTR [rsi+8], xmm5
        vmovsd  QWORD PTR [rsi+24], xmm9
        vmovsd  QWORD PTR [rsi], xmm6
        vmovsd  QWORD PTR [rsi+16], xmm14
        leave
        ret
.L29:
        vmulsd  xmm15, xmm7, xmm8
        vsubsd  xmm14, xmm11, xmm8
        vmulsd  xmm0, xmm0, xmm8
        vmulsd  xmm3, xmm3, xmm14
        vmulsd  xmm7, xmm7, xmm14
        vmovsd  QWORD PTR [rsp-8], xmm15
        jmp     .L10
.L37:
        vmulsd  xmm15, xmm7, xmm8
        vsubsd  xmm14, xmm11, xmm8
        vmulsd  xmm0, xmm0, xmm8
        vmulsd  xmm3, xmm3, xmm14
        vmulsd  xmm7, xmm7, xmm14
        vmovsd  QWORD PTR [rsp-8], xmm15
        jmp     .L19
.L30:
        vsubsd  xmm15, xmm11, xmm9
        vmulsd  xmm1, xmm1, xmm9
        vmulsd  xmm4, xmm2, xmm15
        vmulsd  xmm10, xmm10, xmm15
        vmulsd  xmm2, xmm2, xmm9
        vmovsd  QWORD PTR [rsp-16], xmm4
        jmp     .L13




Clang Output [-std=c++17 -O3 -mavx2 -fno-math-errno]
----------------------------------------------------

.LCPI0_0:
        .quad   4607182418800017408     # double 1
        .quad   4607182418800017408     # double 1
quadBoundingBoxA(Point const*, Box&):      # @quadBoundingBoxA(Point const*, Box&)
        vmovupd xmm0, xmmword ptr [rdi]
        vmovupd xmm1, xmmword ptr [rdi + 16]
        vmovupd xmm2, xmmword ptr [rdi + 32]
        vminpd  xmm3, xmm2, xmm0
        vmaxpd  xmm4, xmm2, xmm0
        vsubpd  xmm5, xmm0, xmm1
        vaddpd  xmm6, xmm1, xmm1
        vsubpd  xmm6, xmm0, xmm6
        vaddpd  xmm6, xmm2, xmm6
        vdivpd  xmm5, xmm5, xmm6
        vxorpd  xmm6, xmm6, xmm6
        vmaxpd  xmm5, xmm6, xmm5
        vmovapd xmm6, xmmword ptr [rip + .LCPI0_0] # xmm6 = [1.000000e+00,1.000000e+00]
        vminpd  xmm5, xmm6, xmm5
        vsubpd  xmm6, xmm6, xmm5
        vmulpd  xmm0, xmm0, xmm6
        vmulpd  xmm7, xmm1, xmm5
        vaddpd  xmm0, xmm7, xmm0
        vmulpd  xmm1, xmm1, xmm6
        vmulpd  xmm2, xmm2, xmm5
        vaddpd  xmm1, xmm2, xmm1
        vmulpd  xmm0, xmm6, xmm0
        vmulpd  xmm1, xmm5, xmm1
        vaddpd  xmm0, xmm0, xmm1
        vminpd  xmm1, xmm0, xmm3
        vmovupd xmmword ptr [rsi], xmm1
        vmaxpd  xmm0, xmm0, xmm4
        vmovupd xmmword ptr [rsi + 16], xmm0
        ret
Comment 1 Andrew Pinski 2018-08-26 00:56:23 UTC
So I think it is an interesting interaction in that GCC cannot change a < b ? a : b into MIN_EXPR.  There might be a reasoning behind this, dealing with NaNs, INF, etc.
Comment 2 Andrew Pinski 2018-08-26 01:09:59 UTC
One more point is in C++ (a < b ? b : a) is a lvalue which might also interfer with converting it into min/max.
Comment 3 Marc Glisse 2018-08-26 07:23:02 UTC
With -ffast-math we (awkwardly) vectorize a couple min/max at the beginning, but clearly not the whole thing like llvm.
Comment 4 Petr 2018-08-26 08:22:42 UTC
I think this code is vectorizable without --fast-math. However, it seems that once a min/max (or something else) is kept scalar it poisons the rest of the code.

The following code works perfectly (scalar):

```
#include <algorithm>

template<typename T> constexpr T altMinT(const T& a, const T& b) noexcept
{ return b < a ? b : a; }
template<typename T> constexpr T altMaxT(const T& a, const T& b) noexcept
{ return a < b ? b : a; }

double std_min(double a, double b) { return std::min(a, b); }
double std_max(double a, double b) { return std::max(a, b); }

double alt_min(double a, double b) { return altMinT(a, b); }
double alt_max(double a, double b) { return altMaxT(a, b); }

```

I think that's the main problem - little samples are optimized well, complex code isn't.
Comment 5 Alexander Monakov 2018-08-26 08:51:42 UTC
We don't manage to use MIN_EXPR and get branchy code that gets optimized for boundary cases (when min/max pick 0.0/1.0). For the minimal input

double f(double x, double y)
{
   return y < x ? y : x;
}

we get MIN_EXPR in .gimple dump with C frontend and -ffinite-math-only -fno-signed-zeros, but not with C++ frontend.

(and even then, hacking the source to use __builtin_fmin to get straight-line code, SLP does not manage to vectorize that)
Comment 6 Petr 2018-08-26 15:19:07 UTC
I think the test-case can even be simplified to something like this:

#include <algorithm>
#include <cmath>

struct Point {
  double x, y;

  void reset(double x, double y) {
    this->x = x;
    this->y = y;
  }
};

void f1(Point* p, Point* a) {
  p->reset(std::max(std::sqrt(p->x), a->x),
           std::max(std::sqrt(p->y), a->y));
}

GCC is unable to vectorize it:

  [-std=c++17 -O3 -mavx2 -fno-math-errno]
  f1(Point*, Point*):
    vsqrtsd xmm0, xmm0, QWORD PTR [rdi+8]
    vmovsd  xmm1, QWORD PTR [rsi+8]
    vsqrtsd xmm2, xmm2, QWORD PTR [rdi]
    vmaxsd  xmm1, xmm1, xmm0
    vmovsd  xmm0, QWORD PTR [rsi]
    vmaxsd  xmm0, xmm0, xmm2
    vunpcklpd       xmm0, xmm0, xmm1
    vmovups XMMWORD PTR [rdi], xmm0
    ret

whereas clang can:

  [-std=c++17 -O3 -mavx2 -fno-math-errno]
  f1(Point*, Point*):
    vsqrtpd xmm0, xmmword ptr [rdi]
    vmovupd xmm1, xmmword ptr [rsi]
    vmaxpd  xmm0, xmm1, xmm0
    vmovupd xmmword ptr [rdi], xmm0
    ret

I think this is a much simpler test-case to start with.
Comment 7 Richard Biener 2018-08-27 07:44:56 UTC
(In reply to Petr from comment #6)
> I think the test-case can even be simplified to something like this:
> 
> #include <algorithm>
> #include <cmath>
> 
> struct Point {
>   double x, y;
> 
>   void reset(double x, double y) {
>     this->x = x;
>     this->y = y;
>   }
> };
> 
> void f1(Point* p, Point* a) {
>   p->reset(std::max(std::sqrt(p->x), a->x),
>            std::max(std::sqrt(p->y), a->y));
> }
> 
> GCC is unable to vectorize it:
> 
>   [-std=c++17 -O3 -mavx2 -fno-math-errno]
>   f1(Point*, Point*):
>     vsqrtsd xmm0, xmm0, QWORD PTR [rdi+8]
>     vmovsd  xmm1, QWORD PTR [rsi+8]
>     vsqrtsd xmm2, xmm2, QWORD PTR [rdi]
>     vmaxsd  xmm1, xmm1, xmm0
>     vmovsd  xmm0, QWORD PTR [rsi]
>     vmaxsd  xmm0, xmm0, xmm2
>     vunpcklpd       xmm0, xmm0, xmm1
>     vmovups XMMWORD PTR [rdi], xmm0
>     ret
> 
> whereas clang can:
> 
>   [-std=c++17 -O3 -mavx2 -fno-math-errno]
>   f1(Point*, Point*):
>     vsqrtpd xmm0, xmmword ptr [rdi]
>     vmovupd xmm1, xmmword ptr [rsi]
>     vmaxpd  xmm0, xmm1, xmm0
>     vmovupd xmmword ptr [rdi], xmm0
>     ret
> 
> I think this is a much simpler test-case to start with.

With -Ofast -mavx2 I get

_Z2f1P5PointS0_:
.LFB1123:
        .cfi_startproc
        vsqrtpd (%rdi), %xmm0
        vmaxpd  (%rsi), %xmm0, %xmm0
        vmovups %xmm0, (%rdi)
        ret

as said elsewhere min/max recognition is the issue here and

  return a < b ? b : a; 

isn't the same as fmax or MAX_EXPR without further constraints.


For the original testcase GCC thinks some vectorization isn't profitable
and even with -Ofast GCC manages to trick itself in a corner where
MIN/MAX_EXPR detection fails to produce straight-line code.  C++
abstraction doesn't help here - we inline the min/max functions before
post-inline passes that would have cleaned up them nicely.  In particular
phiopt is run a bit late and is confused bu jump threading we perform in VRP.
-fno-tree-vrp -Ofast -mavx2 generates straight-line non-vectorized code
which the vectorizer thinks is not profitable to vectorize.

In the end we are confused by the redundant stores (the vectorizer has
some too simplified code to handle this):

  <bb 2> [local count: 1073741825]:
  _12 = MEM[(const double &)bez_1(D) + 8];
  _13 = MEM[(const double &)bez_1(D) + 40];
  iftmp.0_75 = MAX_EXPR <_12, _13>;
  _14 = MEM[(const double &)bez_1(D)];
  _15 = MEM[(const double &)bez_1(D) + 32];
  iftmp.0_74 = MAX_EXPR <_14, _15>;
  iftmp.1_73 = MIN_EXPR <_12, _13>;
  iftmp.1_72 = MIN_EXPR <_14, _15>;
  bBox_4(D)->x0 = iftmp.1_72;
  bBox_4(D)->y0 = iftmp.1_73;
  bBox_4(D)->x1 = iftmp.0_74;
  bBox_4(D)->y1 = iftmp.0_75;

we vectorize up to here with -fno-vect-cost-model

  _6 = MEM[(double *)bez_1(D) + 16B];
  _7 = MEM[(double *)bez_1(D) + 24B];
  _50 = _7 * 2.0e+0;
  _51 = _6 * 2.0e+0;
  _10 = MEM[(double *)bez_1(D)];
  _11 = MEM[(double *)bez_1(D) + 8B];
  _8 = MEM[(double *)bez_1(D) + 32B];
  _3 = _8 + _10;
  _9 = MEM[(double *)bez_1(D) + 40B];
  _5 = _9 + _11;
  _46 = _5 - _50;
  _47 = _3 - _51;
  _44 = _11 - _7;
...

the stores to bBox_4 alias the loads from bez_1 which due to C++ abstraction
from the min/max functions are just double * loads from TBAA perspective.
Given there are no visible stores to bez_1 we cannot assume any containing
dynamic type here.

...
  iftmp.1_67 = MIN_EXPR <_23, iftmp.1_72>;
  bBox_4(D)->x0 = iftmp.1_67;
  iftmp.1_66 = MIN_EXPR <_22, iftmp.1_73>;
  bBox_4(D)->y0 = iftmp.1_66;
  iftmp.0_65 = MAX_EXPR <_23, iftmp.0_74>;
  bBox_4(D)->x1 = iftmp.0_65;
  iftmp.0_64 = MAX_EXPR <_22, iftmp.0_75>;
  bBox_4(D)->y1 = iftmp.0_64;
  return;

Lifting the BB vectorization restriction is on my todo list:

bool
vect_analyze_data_ref_accesses (vec_info *vinfo)
{
...
          /* Do not place the same access in the interleaving chain twice.  */
          if (init_b == init_prev)
            {
              gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
                          < gimple_uid (DR_STMT (drb)));
              /* ???  For now we simply "drop" the later reference which is
                 otherwise the same rather than finishing off this group.
                 In the end we'd want to re-process duplicates forming
                 multiple groups from the refs, likely by just collecting
                 all candidates (including duplicates and split points
                 below) in a vector and then process them together.  */
              continue;
            }
Comment 8 Richard Biener 2018-08-27 07:46:17 UTC
I've also had patches adding an early phiopt pass which would have solved the CFG mess VRP creates.
Comment 9 Richard Biener 2018-10-23 08:44:11 UTC
For the original testcase there's also the issue that the bez[] accesses end up
as

  _68 = MEM[(double *)bez_27(D) + 16B];
  _69 = MEM[(double *)bez_27(D) + 24B];

and thus they possibly alias with the earlier stores

  bBox_33(D)->x0 = iftmp.1_190;
  bBox_33(D)->y0 = iftmp.1_193;
  bBox_33(D)->x1 = iftmp.0_196;
  bBox_33(D)->y1 = iftmp.0_199;

which we subsequently fail to eliminate against the final ones

  iftmp.1_49 = MIN_EXPR <_119, iftmp.1_190>;
  bBox_33(D)->x0 = iftmp.1_49;
  iftmp.1_31 = MIN_EXPR <_118, iftmp.1_193>;
  bBox_33(D)->y0 = iftmp.1_31;
  iftmp.0_102 = MAX_EXPR <_119, iftmp.0_196>;
  bBox_33(D)->x1 = iftmp.0_102;
  iftmp.0_105 = MAX_EXPR <_118, iftmp.0_199>;
  bBox_33(D)->y1 = iftmp.0_105;

if we'd eliminate the earlier ones then vectorization would have had a chance
here.  That's from

// Linear interpolation, works with points as well.
template<typename V, typename T = double>
inline V lerp(const V& a, const V& b, const T& t) noexcept {
  return (a * (1.0 - t)) + (b * t);
}

and

// Min/Max - different semantics compared to std.
template<typename T> constexpr T myMin(const T& a, const T& b) noexcept
{ return b < a ? b : a; }
template<typename T> constexpr T myMax(const T& a, const T& b) noexcept
{ return a < b ? b : a; }

taking references to double.

Plus it is because IPA SRA decomposing the by reference Point passing
to

  _17 = MEM[(double *)b_2(D)];
  _18 = MEM[(double *)b_2(D) + 8B];
  _19 = MEM[(double *)&t];
  _20 = MEM[(double *)&t + 8B];
  D.19260 = _ZmlRK5PointS1_.isra.5 (_17, _18, _19, _20);

that's quite bad for TBAA ... the function uses b->x, etc. to access the
memory.
Comment 10 Richard Biener 2018-10-23 11:35:28 UTC
Author: rguenth
Date: Tue Oct 23 11:34:56 2018
New Revision: 265421

URL: https://gcc.gnu.org/viewcvs?rev=265421&root=gcc&view=rev
Log:
2018-10-23  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/87105
	PR tree-optimization/87608
	* passes.def (pass_all_early_optimizations): Add early phi-opt
	after dce.
	* tree-ssa-phiopt.c (value_replacement): Ignore NOPs and predicts in
	addition to debug stmts.
	(tree_ssa_phiopt_worker): Add early_p argument, do only min/max
	and abs replacement early.
	* tree-cfg.c (gimple_empty_block_p): Likewise.

	* g++.dg/tree-ssa/phiopt-1.C: New testcase.
	g++.dg/vect/slp-pr87105.cc: Likewise.
	* g++.dg/tree-ssa/pr21463.C: Scan phiopt2 because this testcase
	relies on phiprop run before.
	* g++.dg/tree-ssa/pr30738.C: Likewise.
	* g++.dg/tree-ssa/pr57380.C: Likewise.
	* gcc.dg/tree-ssa/pr84859.c: Likewise.
	* gcc.dg/tree-ssa/pr45397.c: Scan phiopt2 because phiopt1 is
	confused by copies in the IL left by EVRP.
	* gcc.dg/tree-ssa/phi-opt-5.c: Likewise, this time confused
	by predictors.
	* gcc.dg/tree-ssa/phi-opt-12.c: Scan phiopt2.
	* gcc.dg/pr24574.c: Likewise.
	* g++.dg/tree-ssa/pr86544.C: Scan phiopt4.

Added:
    trunk/gcc/testsuite/g++.dg/tree-ssa/phiopt-1.C
    trunk/gcc/testsuite/g++.dg/vect/slp-pr87105.cc
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/passes.def
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/testsuite/g++.dg/tree-ssa/pr21463.C
    trunk/gcc/testsuite/g++.dg/tree-ssa/pr30738.C
    trunk/gcc/testsuite/g++.dg/tree-ssa/pr57380.C
    trunk/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
    trunk/gcc/testsuite/gcc.dg/pr24574.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/20040514-1.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/20040518-1.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-12.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-5.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-6.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-8.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/pr45397.c
    trunk/gcc/testsuite/gcc.dg/tree-ssa/pr84859.c
    trunk/gcc/testsuite/gcc.dg/uninit-15.c
    trunk/gcc/tree-cfg.c
    trunk/gcc/tree-ssa-phiopt.c
Comment 11 Richard Biener 2018-10-23 12:19:12 UTC
The code is now better but not vectorized due to mentioned issues.
Comment 12 Richard Biener 2018-10-24 09:05:10 UTC
With the duplicate store issue fixed in the vectorizer we run into the SLP vectorization issue that limits the growth of the SLP tree (yes, it's a tree
and thus tends to grow expontential easily...).
Comment 13 Richard Biener 2018-10-24 11:47:31 UTC
Author: rguenth
Date: Wed Oct 24 11:46:58 2018
New Revision: 265457

URL: https://gcc.gnu.org/viewcvs?rev=265457&root=gcc&view=rev
Log:
2018-10-24  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/87105
	* tree-vect-data-refs.c (vect_analyze_group_access_1): Adjust
	dump classification.
	(vect_analyze_data_ref_accesses): Handle duplicate loads and
	stores by splitting the affected group after the fact.
	* tree-vect-slp.c (vect_build_slp_tree_2): Dump when we
	fail the SLP build because of size constraints.

	* gcc.dg/vect/bb-slp-39.c: New testcase.
	* gfortran.dg/vect/pr83232.f90: Un-XFAIL.

Added:
    trunk/gcc/testsuite/gcc.dg/vect/bb-slp-39.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/testsuite/gfortran.dg/vect/pr83232.f90
    trunk/gcc/tree-vect-data-refs.c
    trunk/gcc/tree-vect-slp.c
Comment 14 Richard Biener 2018-10-26 07:39:35 UTC
Author: rguenth
Date: Fri Oct 26 07:38:59 2018
New Revision: 265522

URL: https://gcc.gnu.org/viewcvs?rev=265522&root=gcc&view=rev
Log:
2018-10-26  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/87105
	* tree-vectorizer.h (_slp_tree::refcnt): New member.
	* tree-vect-slp.c (vect_free_slp_tree): Decrement and honor
	refcnt.
	(vect_create_new_slp_node): Initialize refcnt to one.
	(bst_traits): Move.
	(scalar_stmts_set_t, bst_fail): Remove.
	(vect_build_slp_tree_2): Add bst_map argument and adjust calls.
	(vect_build_slp_tree): Add bst_map argument and lookup
	already created SLP nodes.
	(vect_print_slp_tree): Handle a SLP graph, print SLP node
	addresses.
	(vect_slp_rearrange_stmts): Handle a SLP graph.
	(vect_analyze_slp_instance): Adjust and free SLP nodes from
	the CSE map.  Fix indenting.
	(vect_schedule_slp_instance): Add short-cut.

	* g++.dg/vect/slp-pr87105.cc: Adjust.
	* gcc.dg/torture/20181024-1.c: New testcase.
	* g++.dg/opt/20181025-1.C: Likewise.

Added:
    trunk/gcc/testsuite/g++.dg/opt/20181025-1.C
    trunk/gcc/testsuite/gcc.dg/torture/20181024-1.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/testsuite/g++.dg/vect/slp-pr87105.cc
    trunk/gcc/tree-vect-slp.c
    trunk/gcc/tree-vectorizer.h
Comment 15 Richard Biener 2018-10-26 08:00:52 UTC
Fixed.  There's the redundant store left, the IPA SRA issue, but everything is vectorized now:

_Z16quadBoundingBoxAPK5PointR3Box:
.LFB1125:
        .cfi_startproc
        vmovupd (%rdi), %xmm1
        vmovupd (%rdi), %xmm7
        vminpd  32(%rdi), %xmm1, %xmm4
        vmaxpd  32(%rdi), %xmm7, %xmm7
        vmovups %xmm4, (%rsi)
        vmovups %xmm7, 16(%rsi)
        vmovupd 16(%rdi), %xmm2
        vmovupd (%rdi), %xmm6
        vmovupd 32(%rdi), %xmm1
        vaddpd  %xmm2, %xmm2, %xmm5
        vsubpd  %xmm2, %xmm6, %xmm3
        vaddpd  %xmm6, %xmm1, %xmm0
        vsubpd  %xmm5, %xmm0, %xmm0
        vmovapd .LC0(%rip), %xmm5
        vdivpd  %xmm0, %xmm3, %xmm3
        vxorpd  %xmm0, %xmm0, %xmm0
        vmaxpd  %xmm0, %xmm3, %xmm3
        vminpd  %xmm5, %xmm3, %xmm3
        vsubpd  %xmm3, %xmm5, %xmm5
        vmulpd  %xmm3, %xmm1, %xmm0
        vmulpd  %xmm5, %xmm6, %xmm6
        vmulpd  %xmm5, %xmm2, %xmm1
        vmulpd  %xmm3, %xmm2, %xmm2
        vaddpd  %xmm1, %xmm0, %xmm0
        vaddpd  %xmm6, %xmm2, %xmm2
        vmulpd  %xmm3, %xmm0, %xmm0
        vmulpd  %xmm5, %xmm2, %xmm2
        vaddpd  %xmm2, %xmm0, %xmm0
        vminpd  %xmm0, %xmm4, %xmm4
        vmaxpd  %xmm0, %xmm7, %xmm0
        vmovups %xmm4, (%rsi)
        vmovups %xmm0, 16(%rsi)
        ret
Comment 16 Petr 2018-10-26 12:24:02 UTC
Thanks a lot! I hope much more code would benefit from this change.