Bug 79151 - Missed BB vectorization with strided/scalar stores
Summary: Missed BB vectorization with strided/scalar stores
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 7.0
: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer
  Show dependency treegraph
 
Reported: 2017-01-19 18:16 UTC by Thomas Koenig
Modified: 2017-02-20 17:27 UTC (History)
0 users

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2017-01-20 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Thomas Koenig 2017-01-19 18:16:11 UTC
Consider the following code. The function "scalar" contains two formulas in a
function which are identical, except for the coefficients which
differ.

This could be vectorized.  As an example of how this could be done,
see the function "vector" where vectorization intrinsics are used.

You will see that "vector" is much shorter; all the operations are
done using vector intrinsics.

This is for x86_64-pc-linux-gnu.

#include <stdio.h>
 
void scalar(const double *restrict a, const double *restrict b,
        double x, double *ar, double *br)
{
  double ra, rb;
  int i;
 
  ra = a[0] + a[1]/x - 1.0/(a[0]-a[1]);
  rb = b[0] + b[1]/x - 1.0/(b[0]-b[1]);
 
  *ar = ra;
  *br = rb;
}
 
void vector(const double *restrict a, const double *restrict b,
        double x, double *ar, double *br)
{
  typedef double v2do __attribute__((vector_size (16)));
  v2do c0, c1, r;
 
  c0[0] = a[0];
  c0[1] = b[0];
  c1[0] = a[1];
  c1[1] = b[1];
 
  r = c0 + c1/x - 1.0/(c0-c1);
  *ar = r[0];
  *br = r[1];
}
 
double a[] = {1.0, -1.5};
double b[] = {1.3, -1.2};
 
int main()
{
  double x = 1.24;
  double ar, br;
 
  scalar(a, b, x, &ar, &br);
  printf("%f %f\n", ar, br);
  vector(a, b, x, &ar, &br);
  printf("%f %f\n", ar, br);
 
  return 0;
}

Assembly for the function "scalar":

scalar:
.LFB11:
        .cfi_startproc
        movsd   8(%rdi), %xmm4
        movsd   8(%rsi), %xmm5
        movapd  %xmm4, %xmm1
        movsd   (%rdi), %xmm2
        movapd  %xmm5, %xmm7
        divsd   %xmm0, %xmm1
        divsd   %xmm0, %xmm7
        addsd   %xmm2, %xmm1
        subsd   %xmm4, %xmm2
        movapd  %xmm2, %xmm4
        movsd   (%rsi), %xmm3
        movsd   .LC0(%rip), %xmm2
        movapd  %xmm7, %xmm0
        movapd  %xmm2, %xmm6
        addsd   %xmm3, %xmm0
        subsd   %xmm5, %xmm3
        divsd   %xmm4, %xmm6
        divsd   %xmm3, %xmm2
        subsd   %xmm6, %xmm1
        movsd   %xmm1, (%rdx)
        subsd   %xmm2, %xmm0
        movsd   %xmm0, (%rcx)
        ret

Assembly for the function "vector":

vector:
.LFB12:
        .cfi_startproc
        movsd   8(%rsi), %xmm2
        movsd   8(%rdi), %xmm3
        unpcklpd        %xmm0, %xmm0
        unpcklpd        %xmm2, %xmm3
        movapd  .LC1(%rip), %xmm2
        movsd   (%rdi), %xmm1
        movapd  %xmm3, %xmm4
        movhpd  (%rsi), %xmm1
        divpd   %xmm0, %xmm4
        movapd  %xmm4, %xmm0
        addpd   %xmm1, %xmm0
        subpd   %xmm3, %xmm1
        divpd   %xmm1, %xmm2
        addpd   %xmm2, %xmm0
        movlpd  %xmm0, (%rdx)
        movhpd  %xmm0, (%rcx)
        ret
Comment 1 Richard Biener 2017-01-20 09:19:47 UTC
The basic-block vectorizer does not yet consider strided/scalar stores as a source in its search for vectorization opportunities so it gives up very early.  Basically it searchs for groups of stores that can be vectorized with a vector store and
then looks at how many of the feeding stmts it can include.

Handling this particular case is hard in the current scheme (or rather expensive).

Confirmed.

"Fixing" the testcase to

void scalar(const double *restrict a, const double *restrict b,
            double x, double *ar, double *br)
{
  double ra, rb;
  int i;

  ra = a[0] + a[1]/x - 1.0/(a[0]-a[1]);
  rb = b[0] + b[1]/x - 1.0/(b[0]-b[1]);

  ar[0] = ra;
  ar[1] = rb;
}

fails as well with

t.c:12:1: note: Build SLP for _1 = *a_14(D);
t.c:12:1: note: Build SLP for _7 = *b_17(D);
t.c:12:1: note: Build SLP failed: different interleaving chains in one node _7 = *b_17(D);
t.c:12:1: note: Re-trying with swapped operands of stmts 1
t.c:12:1: note: Build SLP for _1 = *a_14(D);
t.c:12:1: note: Build SLP for _9 = _8 / x_15(D);
t.c:12:1: note: Build SLP failed: different operation in stmt _9 = _8 / x_15(D);
t.c:12:1: note: original stmt _1 = *a_14(D);

but we could handle this with "construction from scalars" and just get
confused by the first mismatch and optimistically trying to swap operands.

As said above the SLP finding algorithm is very much too greedy (with too many
accumulated hacks).
Comment 2 Thomas Koenig 2017-02-20 07:24:04 UTC
Another test case.

It might even be profitable just to look for divisions, because these
are so expensive that packing/unpacking should always be
profitable.

double foo(double a, double b)
{
  return 1/a + 1/b;
}

double v_foo (double a, double b)
{
   typedef double v2do __attribute__((vector_size (16)));
   v2do x, y;

   x[0] = a;
   x[1] = b;
   y = 1/x;
   return y[0] + y[1];
}

Assembly:  foo is

        movsd   .LC0(%rip), %xmm2
        movapd  %xmm2, %xmm3
        divsd   %xmm1, %xmm2
        divsd   %xmm0, %xmm3
        movapd  %xmm3, %xmm0
        addsd   %xmm2, %xmm0
        ret

and v_foo is

        unpcklpd        %xmm1, %xmm0
        movapd  .LC1(%rip), %xmm1
        divpd   %xmm0, %xmm1
        movapd  %xmm1, %xmm2
        unpckhpd        %xmm1, %xmm1
        movapd  %xmm1, %xmm0
        addsd   %xmm2, %xmm0
        ret
Comment 3 Richard Biener 2017-02-20 09:03:37 UTC
The question is of course whether vector division has comparable latency / throughput as the scalar one.
Comment 4 Andrew Pinski 2017-02-20 09:10:49 UTC
(In reply to Richard Biener from comment #3)
> The question is of course whether vector division has comparable latency /
> throughput as the scalar one.

On the cores that cavium produces the answer is yes for most vector types.
Comment 5 Thomas Koenig 2017-02-20 16:57:25 UTC
(In reply to Richard Biener from comment #3)
> The question is of course whether vector division has comparable latency /
> throughput as the scalar one.

Here's a test case on a rather old CPU, a Core 2 Q8200:

$ cat foo.c
#include <stdio.h>

double foo(double a, double b)
#ifdef SCALAR

{
  return 1/a + 1/b;
}
#else
{
  typedef double v2do __attribute__((vector_size (16)));
  v2do x, y;
  
  x[0] = a;
  x[1] = b;
  y = 1/x;
  return y[0] + y[1];
}
#endif

#define NMAX 1000000000

int main()
{
  double a, b, s;
  s = 0.0;
  for (a=1.0; a<NMAX+0.5; a++)
    {
      b = a+0.5;
      s += foo(a,b);
    }
  printf("%f\n", s);
}
$ gcc -DSCALAR -O3 foo.c && time ./a.out
41.987257

real    0m19,508s
user    0m19,500s
sys     0m0,000s
$ gcc -O3 foo.c && time ./a.out
41.987257

real    0m9,146s
user    0m9,140s
sys     0m0,000s
Comment 6 Thomas Koenig 2017-02-20 17:27:56 UTC
A few more test cases with a relatively recent trunk.

POWER7:

[tkoenig@gcc1-power7 ~]$ gcc -mcpu=power7 -O3 foo.c && time ./a.out
41.987257

real    0m3.688s
user    0m3.685s
sys     0m0.002s
[tkoenig@gcc1-power7 ~]$ gcc -mcpu=power7 -DSCALAR -O3 foo.c && time ./a.out
41.987257

real    0m7.317s
user    0m7.314s
sys     0m0.003s

Core(TM) i7-2600:

tkoenig@gcc75:~$ gcc -march=native -O3 foo.c && time ./a.out
41.987257

real    0m5.848s
user    0m5.848s
sys     0m0.000s
tkoenig@gcc75:~$ gcc -march=native -DSCALAR -O3 foo.c && time ./a.out
41.987257

real    0m11.638s
user    0m11.640s
sys     0m0.000s

AMD Athlon(tm) II X4 640:

tkoenig@gcc45:~$ gcc -O3 -march=native foo.c && time ./a.out
41.987257

real    0m8.467s
user    0m7.648s
sys     0m0.000s
tkoenig@gcc45:~$ gcc -O3 -DSCALAR -march=native foo.c && time ./a.out
41.987257

real    0m15.308s
user    0m14.284s
sys     0m0.004s


The exception is with aarch64, an AMD Opteron 1100:

tkoenig@gcc117:~> gcc -O3 -march=native foo.c && time ./a.out 
41.987257

real    0m30.013s
user    0m30.010s
sys     0m0.000s
tkoenig@gcc117:~> gcc -O3 -DSCALAR -march=native foo.c && time ./a.out 
41.987257

real    0m30.013s
user    0m30.010s
sys     0m0.000s

Soo... not always profitable.