This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug tree-optimization/68502] [6 Regression][i686] spec2000/179.art runfails after r222914


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68502

--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
Program received signal SIGSEGV, Segmentation fault.
0x0804a482 in train_match (spot=0) at scanner.c:407
407               f1_layer[ti].W = f1_layer[ti].I[cp] + a*(f1_layer[ti].U);
(gdb) l
402
403           /* Compute F1 layer - W values */
404           tnorm = 0;
405           for (ti=0;ti<numf1s;ti++)
406           {
407               f1_layer[ti].W = f1_layer[ti].I[cp] + a*(f1_layer[ti].U);
408               tnorm += f1_layer[ti].W * f1_layer[ti].W;
409           }
410           tnorm =  sqrt((double)tnorm);
411           /* Compute F1 layer - X values */
(gdb) disassemble
...
   0x0804a478 <+568>:   fldl   0x1c(%eax)
   0x0804a47b <+571>:   mov    (%eax),%ecx
   0x0804a47d <+573>:   add    $0x3c,%eax
   0x0804a480 <+576>:   fmul   %st(2),%st
=> 0x0804a482 <+578>:   faddl  (%ecx,%esi,1)
   0x0804a485 <+581>:   fstl   -0x38(%eax)
   0x0804a488 <+584>:   fmul   %st(0),%st
   0x0804a48a <+586>:   faddp  %st,%st(1)
   0x0804a48c <+588>:   cmp    %eax,%ebx
   0x0804a48e <+590>:   jne    0x804a478 <train_match+568>

even without -mfpmath=sse here.  ecx and esi are 0.  This is the load
f1_layer[ti].I[cp] where f1_layer[ti].I is NULL (%ecx) and cp is 0 (%esi).

We vectorize the loop in reset_nodes()

   for (i=0;i<numf1s;i++)
   {
     f1_layer[i].W = 0.0;
     f1_layer[i].X = 0.0;
     f1_layer[i].V = 0.0;
     f1_layer[i].U = 0.0;
     f1_layer[i].P = 0.0;
     f1_layer[i].Q = 0.0;
     f1_layer[i].R = 0.0;
   }

which does not clear .I but the vectorized variant does:

   0x0804a28a <+74>:    vxorpd %xmm0,%xmm0,%xmm0
   0x0804a28e <+78>:    add    $0x1,%esi
   0x0804a291 <+81>:    vmovupd %ymm0,(%edx)
   0x0804a295 <+85>:    add    $0xe0,%edx
   0x0804a29b <+91>:    vmovupd %ymm0,-0xc0(%edx)
   0x0804a2a3 <+99>:    vmovupd %ymm0,-0xa0(%edx)
   0x0804a2ab <+107>:   vmovupd %ymm0,-0x80(%edx)
   0x0804a2b0 <+112>:   vmovupd %ymm0,-0x60(%edx)
   0x0804a2b5 <+117>:   vmovupd %ymm0,-0x40(%edx)
   0x0804a2ba <+122>:   vmovupd %ymm0,-0x20(%edx)
   0x0804a2bf <+127>:   cmp    %ebx,%esi
   0x0804a2c1 <+129>:   jb     0x804a28e <train_match+78>


testcase:

/* Node record used by the testcase: one pointer member followed by seven
   double members.  The member order matters for the reproducer -- the
   pointer I sits between the runs of doubles of consecutive array
   elements.  Per the bug report, with -m32 that pointer is 32 bits wide
   while the vector element (double) is 64 bits wide.  */
typedef struct {
    double *I;  /* Never written by reset_nodes(); main() checks it survives.  */
    double W;   /* W through R are the members reset_nodes() zeroes.  */
    double X;
    double V;
    double U;
    double P;
    double Q;
    double R;
} f1_neuron;

/* Array of nodes operated on by reset_nodes(); allocated in main().  */
f1_neuron *f1_layer;

/* Number of elements in f1_layer; used as the bound of every loop in the
   testcase.  */
int numf1s = 1000;

/* Zero the seven double members (W, X, V, U, P, Q, R) of each of the
   first numf1s elements of f1_layer.  The pointer member I is
   deliberately left untouched: the bug being reproduced is that the
   vectorized form of this loop also overwrote I.

   The noinline/noclone attributes keep this function from being inlined
   or cloned into its caller.  Do not restructure the loop body; the
   exact sequence of stores is what the vectorizer has to see.  */
void __attribute__((noinline,noclone))
reset_nodes()
{ int i;

  for (i=0;i<numf1s;i++)
    {
      /* Seven consecutive double stores per element; nothing stores to .I.  */
      f1_layer[i].W = 0.0;
      f1_layer[i].X = 0.0;
      f1_layer[i].V = 0.0;
      f1_layer[i].U = 0.0;
      f1_layer[i].P = 0.0;
      f1_layer[i].Q = 0.0;
      f1_layer[i].R = 0.0;
    }
}

/* Test driver: allocate numf1s nodes, set every .I to the sentinel
   (double *)-1, call reset_nodes(), then abort() if any .I no longer
   holds the sentinel.  Returns 0 when every .I is intact.

   NOTE(review): no #include <stdlib.h> is visible in this excerpt, so
   malloc and abort are used without a visible declaration, and the
   malloc result is not checked before use.  */
int main ()
{
  int i;
  f1_layer = (f1_neuron *)malloc (numf1s * sizeof (f1_neuron));
  /* Only .I is initialized; the double members are written by
     reset_nodes() and never read here.  */
  for (i = 0; i < numf1s; i++)
    f1_layer[i].I = (double *)-1;
  reset_nodes ();
  /* A changed .I means reset_nodes() stored outside the members it is
     supposed to zero.  */
  for (i = 0; i < numf1s; i++)
    if (f1_layer[i].I != (double *)-1)
      abort ();
  return 0;
}

The AVX2 dependence is just a cost model issue; -fno-vect-cost-model makes it
fail with SSE2 as well (but still only with -m32).

Ah, it depends on a GAP that is not a multiple of the vector element size!
The gap is a 32-bit pointer but the vector element is 64 bits wide, which can
only happen because double is aligned to just 32 bits.

Interesting case ;)

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]