GCC is unable to autovectorize the following code. It seems that it doesn't like min/max, but I'm not entirely sure. I stripped the code off my project so it's a bit longer, hope that's fine. I attached also a code compiled by clang, which is perfectly vectorized and what I would like to get from GCC. The demonstration code ---------------------- #include <algorithm> #include <cmath> #include <stdint.h> // Point structure [x, y] struct Point { double x, y; inline Point() noexcept = default; constexpr Point(const Point&) noexcept = default; constexpr Point(double x, double y) noexcept : x(x), y(y) {} }; // Box structure [x0, y0, x1, y1] struct Box { double x0, y0, x1, y1; inline void reset(double x0, double y0, double x1, double y1) noexcept { this->x0 = x0; this->y0 = y0; this->x1 = x1; this->y1 = y1; } }; // Overloads to make vector processing simpler. static constexpr Point operator-(const Point& a) noexcept { return Point(-a.x, -a.y); } static constexpr Point operator+(const Point& a, double b) noexcept { return Point(a.x + b, a.y + b); } static constexpr Point operator-(const Point& a, double b) noexcept { return Point(a.x - b, a.y - b); } static constexpr Point operator*(const Point& a, double b) noexcept { return Point(a.x * b, a.y * b); } static constexpr Point operator/(const Point& a, double b) noexcept { return Point(a.x / b, a.y / b); } static constexpr Point operator+(const Point& a, const Point& b) noexcept { return Point(a.x + b.x, a.y + b.y); } static constexpr Point operator-(const Point& a, const Point& b) noexcept { return Point(a.x - b.x, a.y - b.y); } static constexpr Point operator*(const Point& a, const Point& b) noexcept { return Point(a.x * b.x, a.y * b.y); } static constexpr Point operator/(const Point& a, const Point& b) noexcept { return Point(a.x / b.x, a.y / b.y); } static constexpr Point operator+(double a, const Point& b) noexcept { return Point(a + b.x, a + b.y); } static constexpr Point operator-(double a, const Point& b) noexcept { return Point(a - b.x, a - b.y); } static constexpr Point operator*(double a, const Point& b) noexcept { return Point(a * b.x, a * b.y); } static constexpr Point operator/(double a, const Point& b) noexcept { return Point(a / b.x, a / b.y); } // Min/Max - different semantics compared to std. template<typename T> constexpr T myMin(const T& a, const T& b) noexcept { return b < a ? b : a; } template<typename T> constexpr T myMax(const T& a, const T& b) noexcept { return a < b ? b : a; } // Linear interpolation, works with points as well. template<typename V, typename T = double> inline V lerp(const V& a, const V& b, const T& t) noexcept { return (a * (1.0 - t)) + (b * t); } // Merge a point into a box by possibly increasing its bounds. inline void boxMergePoint(Box& box, const Point& p) noexcept { box.x0 = myMin(box.x0, p.x); box.y0 = myMin(box.y0, p.y); box.x1 = myMax(box.x1, p.x); box.y1 = myMax(box.y1, p.y); } void quadBoundingBoxA(const Point bez[3], Box& bBox) noexcept { // Bounding box of start and end points. bBox.reset(myMin(bez[0].x, bez[2].x), myMin(bez[0].y, bez[2].y), myMax(bez[0].x, bez[2].x), myMax(bez[0].y, bez[2].y)); Point t = (bez[0] - bez[1]) / (bez[0] - bez[1] * 2.0 + bez[2]); t.x = myMax(t.x, 0.0); t.y = myMax(t.y, 0.0); t.x = myMin(t.x, 1.0); t.y = myMin(t.y, 1.0); boxMergePoint(bBox, lerp(lerp(bez[0], bez[1], t), lerp(bez[1], bez[2], t), t)); } GCC Output [-std=c++17 -O3 -mavx2 -fno-math-errno] -------------------------------------------------- quadBoundingBoxA(Point const*, Box&): push rbp mov rbp, rsp and rsp, -32 vmovsd xmm1, QWORD PTR [rdi+8] vmovsd xmm0, QWORD PTR [rdi] vmovsd xmm5, QWORD PTR [rdi+40] vmovsd xmm6, QWORD PTR [rdi+32] vmaxsd xmm13, xmm5, xmm1 vmaxsd xmm12, xmm6, xmm0 vminsd xmm5, xmm5, xmm1 vminsd xmm6, xmm6, xmm0 vunpcklpd xmm0, xmm12, xmm13 vunpcklpd xmm1, xmm6, xmm5 vmovups XMMWORD PTR [rsi+16], xmm0 vmovups XMMWORD PTR [rsi], xmm1 vmovsd xmm2, QWORD PTR [rdi+24] vmovsd xmm10, QWORD PTR [rdi+8] vmovsd xmm1, QWORD PTR [rdi+40] vmovsd xmm7, QWORD PTR [rdi+16] vaddsd xmm4, xmm2, xmm2 vsubsd xmm9, xmm10, xmm2 vmovsd xmm3, QWORD PTR [rdi] vmovsd xmm0, QWORD PTR [rdi+32] vsubsd xmm8, xmm3, xmm7 vsubsd xmm4, xmm10, xmm4 vaddsd xmm4, xmm4, xmm1 vdivsd xmm9, xmm9, xmm4 vaddsd xmm4, xmm7, xmm7 vsubsd xmm4, xmm3, xmm4 vaddsd xmm4, xmm4, xmm0 vdivsd xmm8, xmm8, xmm4 vxorpd xmm4, xmm4, xmm4 vcomisd xmm4, xmm8 ja .L6 vcomisd xmm4, xmm9 jbe .L36 vmovsd xmm11, QWORD PTR .LC1[rip] vmulsd xmm14, xmm1, xmm4 vmulsd xmm9, xmm2, xmm4 vcomisd xmm8, xmm11 jbe .L37 vmovsd QWORD PTR [rsp-16], xmm2 vmovapd xmm1, xmm14 vmovapd xmm2, xmm9 vxorpd xmm14, xmm14, xmm14 vmovsd QWORD PTR [rsp-8], xmm7 vmulsd xmm3, xmm3, xmm4 vmovapd xmm15, xmm11 vmovapd xmm8, xmm11 vmulsd xmm7, xmm7, xmm4 vxorpd xmm9, xmm9, xmm9 jmp .L13 .L6: vmulsd xmm11, xmm7, xmm4 vcomisd xmm4, xmm9 vxorpd xmm8, xmm8, xmm8 vmulsd xmm0, xmm0, xmm4 vmovsd QWORD PTR [rsp-8], xmm11 vmovsd xmm11, QWORD PTR .LC1[rip] vmovapd xmm14, xmm11 jbe .L10 .L19: vmovsd QWORD PTR [rsp-16], xmm2 vmulsd xmm1, xmm1, xmm4 vmovapd xmm15, xmm11 vxorpd xmm9, xmm9, xmm9 vmulsd xmm2, xmm2, xmm4 jmp .L13 .L36: vmovsd xmm11, QWORD PTR .LC1[rip] vcomisd xmm8, xmm11 jbe .L29 vmovsd QWORD PTR [rsp-8], xmm7 vmulsd xmm3, xmm3, xmm4 vxorpd xmm14, xmm14, xmm14 vmovapd xmm8, xmm11 vmulsd xmm7, xmm7, xmm4 .L10: vcomisd xmm9, xmm11 jbe .L30 vmulsd xmm15, xmm2, xmm4 vmovapd xmm9, xmm11 vmulsd xmm10, xmm10, xmm4 vmovsd QWORD PTR [rsp-16], xmm15 vxorpd xmm15, xmm15, xmm15 .L13: vaddsd xmm1, xmm1, QWORD PTR [rsp-16] vaddsd xmm3, xmm3, QWORD PTR [rsp-8] vaddsd xmm2, xmm2, xmm10 vaddsd xmm0, xmm0, xmm7 vmulsd xmm9, xmm1, xmm9 vmulsd xmm15, xmm2, xmm15 vmulsd xmm8, xmm0, xmm8 vmulsd xmm14, xmm3, xmm14 vaddsd xmm9, xmm9, xmm15 vaddsd xmm14, xmm8, xmm14 vminsd xmm5, xmm9, xmm5 vmaxsd xmm9, xmm9, xmm13 vminsd xmm6, xmm14, xmm6 vmaxsd xmm14, xmm14, xmm12 vmovsd QWORD PTR [rsi+8], xmm5 vmovsd QWORD PTR [rsi+24], xmm9 vmovsd QWORD PTR [rsi], xmm6 vmovsd QWORD PTR [rsi+16], xmm14 leave ret .L29: vmulsd xmm15, xmm7, xmm8 vsubsd xmm14, xmm11, xmm8 vmulsd xmm0, xmm0, xmm8 vmulsd xmm3, xmm3, xmm14 vmulsd xmm7, xmm7, xmm14 vmovsd QWORD PTR [rsp-8], xmm15 jmp .L10 .L37: vmulsd xmm15, xmm7, xmm8 vsubsd xmm14, xmm11, xmm8 vmulsd xmm0, xmm0, xmm8 vmulsd xmm3, xmm3, xmm14 vmulsd xmm7, xmm7, xmm14 vmovsd QWORD PTR [rsp-8], xmm15 jmp .L19 .L30: vsubsd xmm15, xmm11, xmm9 vmulsd xmm1, xmm1, xmm9 vmulsd xmm4, xmm2, xmm15 vmulsd xmm10, xmm10, xmm15 vmulsd xmm2, xmm2, xmm9 vmovsd QWORD PTR [rsp-16], xmm4 jmp .L13 Clang Output [-std=c++17 -O3 -mavx2 -fno-math-errno] ---------------------------------------------------- .LCPI0_0: .quad 4607182418800017408 # double 1 .quad 4607182418800017408 # double 1 quadBoundingBoxA(Point const*, Box&): # @quadBoundingBoxA(Point const*, Box&) vmovupd xmm0, xmmword ptr [rdi] vmovupd xmm1, xmmword ptr [rdi + 16] vmovupd xmm2, xmmword ptr [rdi + 32] vminpd xmm3, xmm2, xmm0 vmaxpd xmm4, xmm2, xmm0 vsubpd xmm5, xmm0, xmm1 vaddpd xmm6, xmm1, xmm1 vsubpd xmm6, xmm0, xmm6 vaddpd xmm6, xmm2, xmm6 vdivpd xmm5, xmm5, xmm6 vxorpd xmm6, xmm6, xmm6 vmaxpd xmm5, xmm6, xmm5 vmovapd xmm6, xmmword ptr [rip + .LCPI0_0] # xmm6 = [1.000000e+00,1.000000e+00] vminpd xmm5, xmm6, xmm5 vsubpd xmm6, xmm6, xmm5 vmulpd xmm0, xmm0, xmm6 vmulpd xmm7, xmm1, xmm5 vaddpd xmm0, xmm7, xmm0 vmulpd xmm1, xmm1, xmm6 vmulpd xmm2, xmm2, xmm5 vaddpd xmm1, xmm2, xmm1 vmulpd xmm0, xmm6, xmm0 vmulpd xmm1, xmm5, xmm1 vaddpd xmm0, xmm0, xmm1 vminpd xmm1, xmm0, xmm3 vmovupd xmmword ptr [rsi], xmm1 vmaxpd xmm0, xmm0, xmm4 vmovupd xmmword ptr [rsi + 16], xmm0 ret
So I think it is an interesting interaction in that GCC cannot change a < b ? a : b into MIN_EXPR. There might be a reasoning behind this, dealing with NaNs, INF, etc.
One more point is in C++ (a < b ? b : a) is a lvalue which might also interfer with converting it into min/max.
With -ffast-math we (awkwardly) vectorize a couple min/max at the beginning, but clearly not the whole thing like llvm.
I think this code is vectorizable without --fast-math. However, it seems that once a min/max (or something else) is kept scalar it poisons the rest of the code. The following code works perfectly (scalar): ``` #include <algorithm> template<typename T> constexpr T altMinT(const T& a, const T& b) noexcept { return b < a ? b : a; } template<typename T> constexpr T altMaxT(const T& a, const T& b) noexcept { return a < b ? b : a; } double std_min(double a, double b) { return std::min(a, b); } double std_max(double a, double b) { return std::max(a, b); } double alt_min(double a, double b) { return altMinT(a, b); } double alt_max(double a, double b) { return altMaxT(a, b); } ``` I think that's the main problem - little samples are optimized well, complex code isn't.
We don't manage to use MIN_EXPR and get branchy code that gets optimized for boundary cases (when min/max pick 0.0/1.0). For the minimal input double f(double x, double y) { return y < x ? y : x; } we get MIN_EXPR in .gimple dump with C frontend and -ffinite-math-only -fno-signed-zeros, but not with C++ frontend. (and even then, hacking the source to use __builtin_fmin to get straight-line code, SLP does not manage to vectorize that)
I think the test-case can even be simplified to something like this: #include <algorithm> #include <cmath> struct Point { double x, y; void reset(double x, double y) { this->x = x; this->y = y; } }; void f1(Point* p, Point* a) { p->reset(std::max(std::sqrt(p->x), a->x), std::max(std::sqrt(p->y), a->y)); } GCC is unable to vectorize it: [-std=c++17 -O3 -mavx2 -fno-math-errno] f1(Point*, Point*): vsqrtsd xmm0, xmm0, QWORD PTR [rdi+8] vmovsd xmm1, QWORD PTR [rsi+8] vsqrtsd xmm2, xmm2, QWORD PTR [rdi] vmaxsd xmm1, xmm1, xmm0 vmovsd xmm0, QWORD PTR [rsi] vmaxsd xmm0, xmm0, xmm2 vunpcklpd xmm0, xmm0, xmm1 vmovups XMMWORD PTR [rdi], xmm0 ret whereas clang can: [-std=c++17 -O3 -mavx2 -fno-math-errno] f1(Point*, Point*): vsqrtpd xmm0, xmmword ptr [rdi] vmovupd xmm1, xmmword ptr [rsi] vmaxpd xmm0, xmm1, xmm0 vmovupd xmmword ptr [rdi], xmm0 ret I think this is a much simpler test-case to start with.
(In reply to Petr from comment #6) > I think the test-case can even be simplified to something like this: > > #include <algorithm> > #include <cmath> > > struct Point { > double x, y; > > void reset(double x, double y) { > this->x = x; > this->y = y; > } > }; > > void f1(Point* p, Point* a) { > p->reset(std::max(std::sqrt(p->x), a->x), > std::max(std::sqrt(p->y), a->y)); > } > > GCC is unable to vectorize it: > > [-std=c++17 -O3 -mavx2 -fno-math-errno] > f1(Point*, Point*): > vsqrtsd xmm0, xmm0, QWORD PTR [rdi+8] > vmovsd xmm1, QWORD PTR [rsi+8] > vsqrtsd xmm2, xmm2, QWORD PTR [rdi] > vmaxsd xmm1, xmm1, xmm0 > vmovsd xmm0, QWORD PTR [rsi] > vmaxsd xmm0, xmm0, xmm2 > vunpcklpd xmm0, xmm0, xmm1 > vmovups XMMWORD PTR [rdi], xmm0 > ret > > whereas clang can: > > [-std=c++17 -O3 -mavx2 -fno-math-errno] > f1(Point*, Point*): > vsqrtpd xmm0, xmmword ptr [rdi] > vmovupd xmm1, xmmword ptr [rsi] > vmaxpd xmm0, xmm1, xmm0 > vmovupd xmmword ptr [rdi], xmm0 > ret > > I think this is a much simpler test-case to start with. With -Ofast -mavx2 I get _Z2f1P5PointS0_: .LFB1123: .cfi_startproc vsqrtpd (%rdi), %xmm0 vmaxpd (%rsi), %xmm0, %xmm0 vmovups %xmm0, (%rdi) ret as said elsewhere min/max recognition is the issue here and return a < b ? b : a; isn't the same as fmax or MAX_EXPR without further constraints. For the original testcase GCC thinks some vectorization isn't profitable and even with -Ofast GCC manages to trick itself in a corner where MIN/MAX_EXPR detection fails to produce straight-line code. C++ abstraction doesn't help here - we inline the min/max functions before post-inline passes that would have cleaned up them nicely. In particular phiopt is run a bit late and is confused bu jump threading we perform in VRP. -fno-tree-vrp -Ofast -mavx2 generates straight-line non-vectorized code which the vectorizer thinks is not profitable to vectorize. In the end we are confused by the redundant stores (the vectorizer has some too simplified code to handle this): <bb 2> [local count: 1073741825]: _12 = MEM[(const double &)bez_1(D) + 8]; _13 = MEM[(const double &)bez_1(D) + 40]; iftmp.0_75 = MAX_EXPR <_12, _13>; _14 = MEM[(const double &)bez_1(D)]; _15 = MEM[(const double &)bez_1(D) + 32]; iftmp.0_74 = MAX_EXPR <_14, _15>; iftmp.1_73 = MIN_EXPR <_12, _13>; iftmp.1_72 = MIN_EXPR <_14, _15>; bBox_4(D)->x0 = iftmp.1_72; bBox_4(D)->y0 = iftmp.1_73; bBox_4(D)->x1 = iftmp.0_74; bBox_4(D)->y1 = iftmp.0_75; we vectorize up to here with -fno-vect-cost-model _6 = MEM[(double *)bez_1(D) + 16B]; _7 = MEM[(double *)bez_1(D) + 24B]; _50 = _7 * 2.0e+0; _51 = _6 * 2.0e+0; _10 = MEM[(double *)bez_1(D)]; _11 = MEM[(double *)bez_1(D) + 8B]; _8 = MEM[(double *)bez_1(D) + 32B]; _3 = _8 + _10; _9 = MEM[(double *)bez_1(D) + 40B]; _5 = _9 + _11; _46 = _5 - _50; _47 = _3 - _51; _44 = _11 - _7; ... the stores to bBox_4 alias the loads from bez_1 which due to C++ abstraction from the min/max functions are just double * loads from TBAA perspective. Given there are no visible stores to bez_1 we cannot assume any containing dynamic type here. ... iftmp.1_67 = MIN_EXPR <_23, iftmp.1_72>; bBox_4(D)->x0 = iftmp.1_67; iftmp.1_66 = MIN_EXPR <_22, iftmp.1_73>; bBox_4(D)->y0 = iftmp.1_66; iftmp.0_65 = MAX_EXPR <_23, iftmp.0_74>; bBox_4(D)->x1 = iftmp.0_65; iftmp.0_64 = MAX_EXPR <_22, iftmp.0_75>; bBox_4(D)->y1 = iftmp.0_64; return; Lifting the BB vectorization restriction is on my todo list: bool vect_analyze_data_ref_accesses (vec_info *vinfo) { ... /* Do not place the same access in the interleaving chain twice. */ if (init_b == init_prev) { gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1])) < gimple_uid (DR_STMT (drb))); /* ??? For now we simply "drop" the later reference which is otherwise the same rather than finishing off this group. In the end we'd want to re-process duplicates forming multiple groups from the refs, likely by just collecting all candidates (including duplicates and split points below) in a vector and then process them together. */ continue; }
I've also had patches adding an early phiopt pass which would have solved the CFG mess VRP creates.
For the original testcase there's also the issue that the bez[] accesses end up as _68 = MEM[(double *)bez_27(D) + 16B]; _69 = MEM[(double *)bez_27(D) + 24B]; and thus they possibly alias with the earlier stores bBox_33(D)->x0 = iftmp.1_190; bBox_33(D)->y0 = iftmp.1_193; bBox_33(D)->x1 = iftmp.0_196; bBox_33(D)->y1 = iftmp.0_199; which we subsequently fail to eliminate against the final ones iftmp.1_49 = MIN_EXPR <_119, iftmp.1_190>; bBox_33(D)->x0 = iftmp.1_49; iftmp.1_31 = MIN_EXPR <_118, iftmp.1_193>; bBox_33(D)->y0 = iftmp.1_31; iftmp.0_102 = MAX_EXPR <_119, iftmp.0_196>; bBox_33(D)->x1 = iftmp.0_102; iftmp.0_105 = MAX_EXPR <_118, iftmp.0_199>; bBox_33(D)->y1 = iftmp.0_105; if we'd eliminate the earlier ones then vectorization would have had a chance here. That's from // Linear interpolation, works with points as well. template<typename V, typename T = double> inline V lerp(const V& a, const V& b, const T& t) noexcept { return (a * (1.0 - t)) + (b * t); } and // Min/Max - different semantics compared to std. template<typename T> constexpr T myMin(const T& a, const T& b) noexcept { return b < a ? b : a; } template<typename T> constexpr T myMax(const T& a, const T& b) noexcept { return a < b ? b : a; } taking references to double. Plus it is because IPA SRA decomposing the by reference Point passing to _17 = MEM[(double *)b_2(D)]; _18 = MEM[(double *)b_2(D) + 8B]; _19 = MEM[(double *)&t]; _20 = MEM[(double *)&t + 8B]; D.19260 = _ZmlRK5PointS1_.isra.5 (_17, _18, _19, _20); that's quite bad for TBAA ... the function uses b->x, etc. to access the memory.
Author: rguenth Date: Tue Oct 23 11:34:56 2018 New Revision: 265421 URL: https://gcc.gnu.org/viewcvs?rev=265421&root=gcc&view=rev Log: 2018-10-23 Richard Biener <rguenther@suse.de> PR tree-optimization/87105 PR tree-optimization/87608 * passes.def (pass_all_early_optimizations): Add early phi-opt after dce. * tree-ssa-phiopt.c (value_replacement): Ignore NOPs and predicts in addition to debug stmts. (tree_ssa_phiopt_worker): Add early_p argument, do only min/max and abs replacement early. * tree-cfg.c (gimple_empty_block_p): Likewise. * g++.dg/tree-ssa/phiopt-1.C: New testcase. g++.dg/vect/slp-pr87105.cc: Likewise. * g++.dg/tree-ssa/pr21463.C: Scan phiopt2 because this testcase relies on phiprop run before. * g++.dg/tree-ssa/pr30738.C: Likewise. * g++.dg/tree-ssa/pr57380.C: Likewise. * gcc.dg/tree-ssa/pr84859.c: Likewise. * gcc.dg/tree-ssa/pr45397.c: Scan phiopt2 because phiopt1 is confused by copies in the IL left by EVRP. * gcc.dg/tree-ssa/phi-opt-5.c: Likewise, this time confused by predictors. * gcc.dg/tree-ssa/phi-opt-12.c: Scan phiopt2. * gcc.dg/pr24574.c: Likewise. * g++.dg/tree-ssa/pr86544.C: Scan phiopt4. Added: trunk/gcc/testsuite/g++.dg/tree-ssa/phiopt-1.C trunk/gcc/testsuite/g++.dg/vect/slp-pr87105.cc Modified: trunk/gcc/ChangeLog trunk/gcc/passes.def trunk/gcc/testsuite/ChangeLog trunk/gcc/testsuite/g++.dg/tree-ssa/pr21463.C trunk/gcc/testsuite/g++.dg/tree-ssa/pr30738.C trunk/gcc/testsuite/g++.dg/tree-ssa/pr57380.C trunk/gcc/testsuite/g++.dg/tree-ssa/pr86544.C trunk/gcc/testsuite/gcc.dg/pr24574.c trunk/gcc/testsuite/gcc.dg/tree-ssa/20040514-1.c trunk/gcc/testsuite/gcc.dg/tree-ssa/20040518-1.c trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-12.c trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-5.c trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-6.c trunk/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-8.c trunk/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c trunk/gcc/testsuite/gcc.dg/tree-ssa/pr45397.c trunk/gcc/testsuite/gcc.dg/tree-ssa/pr84859.c trunk/gcc/testsuite/gcc.dg/uninit-15.c trunk/gcc/tree-cfg.c trunk/gcc/tree-ssa-phiopt.c
The code is now better but not vectorized due to mentioned issues.
With the duplicate store issue fixed in the vectorizer we run into the SLP vectorization issue that limits the growth of the SLP tree (yes, it's a tree and thus tends to grow expontential easily...).
Author: rguenth Date: Wed Oct 24 11:46:58 2018 New Revision: 265457 URL: https://gcc.gnu.org/viewcvs?rev=265457&root=gcc&view=rev Log: 2018-10-24 Richard Biener <rguenther@suse.de> PR tree-optimization/87105 * tree-vect-data-refs.c (vect_analyze_group_access_1): Adjust dump classification. (vect_analyze_data_ref_accesses): Handle duplicate loads and stores by splitting the affected group after the fact. * tree-vect-slp.c (vect_build_slp_tree_2): Dump when we fail the SLP build because of size constraints. * gcc.dg/vect/bb-slp-39.c: New testcase. * gfortran.dg/vect/pr83232.f90: Un-XFAIL. Added: trunk/gcc/testsuite/gcc.dg/vect/bb-slp-39.c Modified: trunk/gcc/ChangeLog trunk/gcc/testsuite/ChangeLog trunk/gcc/testsuite/gfortran.dg/vect/pr83232.f90 trunk/gcc/tree-vect-data-refs.c trunk/gcc/tree-vect-slp.c
Author: rguenth Date: Fri Oct 26 07:38:59 2018 New Revision: 265522 URL: https://gcc.gnu.org/viewcvs?rev=265522&root=gcc&view=rev Log: 2018-10-26 Richard Biener <rguenther@suse.de> PR tree-optimization/87105 * tree-vectorizer.h (_slp_tree::refcnt): New member. * tree-vect-slp.c (vect_free_slp_tree): Decrement and honor refcnt. (vect_create_new_slp_node): Initialize refcnt to one. (bst_traits): Move. (scalar_stmts_set_t, bst_fail): Remove. (vect_build_slp_tree_2): Add bst_map argument and adjust calls. (vect_build_slp_tree): Add bst_map argument and lookup already created SLP nodes. (vect_print_slp_tree): Handle a SLP graph, print SLP node addresses. (vect_slp_rearrange_stmts): Handle a SLP graph. (vect_analyze_slp_instance): Adjust and free SLP nodes from the CSE map. Fix indenting. (vect_schedule_slp_instance): Add short-cut. * g++.dg/vect/slp-pr87105.cc: Adjust. * gcc.dg/torture/20181024-1.c: New testcase. * g++.dg/opt/20181025-1.C: Likewise. Added: trunk/gcc/testsuite/g++.dg/opt/20181025-1.C trunk/gcc/testsuite/gcc.dg/torture/20181024-1.c Modified: trunk/gcc/ChangeLog trunk/gcc/testsuite/ChangeLog trunk/gcc/testsuite/g++.dg/vect/slp-pr87105.cc trunk/gcc/tree-vect-slp.c trunk/gcc/tree-vectorizer.h
Fixed. There's the redundant store left, the IPA SRA issue, but everything is vectorized now: _Z16quadBoundingBoxAPK5PointR3Box: .LFB1125: .cfi_startproc vmovupd (%rdi), %xmm1 vmovupd (%rdi), %xmm7 vminpd 32(%rdi), %xmm1, %xmm4 vmaxpd 32(%rdi), %xmm7, %xmm7 vmovups %xmm4, (%rsi) vmovups %xmm7, 16(%rsi) vmovupd 16(%rdi), %xmm2 vmovupd (%rdi), %xmm6 vmovupd 32(%rdi), %xmm1 vaddpd %xmm2, %xmm2, %xmm5 vsubpd %xmm2, %xmm6, %xmm3 vaddpd %xmm6, %xmm1, %xmm0 vsubpd %xmm5, %xmm0, %xmm0 vmovapd .LC0(%rip), %xmm5 vdivpd %xmm0, %xmm3, %xmm3 vxorpd %xmm0, %xmm0, %xmm0 vmaxpd %xmm0, %xmm3, %xmm3 vminpd %xmm5, %xmm3, %xmm3 vsubpd %xmm3, %xmm5, %xmm5 vmulpd %xmm3, %xmm1, %xmm0 vmulpd %xmm5, %xmm6, %xmm6 vmulpd %xmm5, %xmm2, %xmm1 vmulpd %xmm3, %xmm2, %xmm2 vaddpd %xmm1, %xmm0, %xmm0 vaddpd %xmm6, %xmm2, %xmm2 vmulpd %xmm3, %xmm0, %xmm0 vmulpd %xmm5, %xmm2, %xmm2 vaddpd %xmm2, %xmm0, %xmm0 vminpd %xmm0, %xmm4, %xmm4 vmaxpd %xmm0, %xmm7, %xmm0 vmovups %xmm4, (%rsi) vmovups %xmm0, 16(%rsi) ret
Thanks a lot! I hope much more code would benefit from this change.