This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.



[Bug target/34682] 70% slowdown with SSE enabled



------- Comment #5 from ubizjak at gmail dot com  2008-01-07 12:19 -------
Confirmed by the following testcase:

--cut here--
#include <stdio.h>

void __attribute__((noinline))
dtime (void) 
{
  __asm__ __volatile__ ("" : : : "memory");
}

double sa, sb, sc, sd;
double one, two, four, five;
double piref, piprg, pierr;

int
main (int argc, char *argv[])
{
  double s, u, v, w, x;

  long i, m;

  piref = 3.14159265358979324;
  one = 1.0;
  two = 2.0;
  four = 4.0;
  five = 5.0;

  m = 512000000;

  dtime();

  s = -five;
  sa = -one;

  dtime();

  for (i = 1; i <= m; i++)
    {
      s = -s;
      sa = sa + s;
    }

  dtime();

  sc = (double) m;

  u = sa;
  v = 0.0;
  w = 0.0;
  x = 0.0;

  dtime();

  for (i = 1; i <= m; i++)
    {
      s = -s;
      sa = sa + s;
      u = u + two;
      x = x + (s - u);
      v = v - s * u;
      w = w + s / u;
    }

  dtime();

  m = (long) (sa * x / sc);
  sa = four * w / five;
  sb = sa + five / v;
  sc = 31.25;
  piprg = sb - sc / (v * v * v);
  pierr = piprg - piref;

  printf ("%13.4le\n", pierr);
  return 0;
}
--cut here--

.L5:
        xorb    $-128, -17(%ebp)        #, s
        addl    $1, %eax        #, i.65
        addsd   %xmm4, %xmm1    # two.16, u
        cmpl    $512000001, %eax        #, i.65
        movsd   -24(%ebp), %xmm0        # s, tmp90
        addsd   -24(%ebp), %xmm2        # s, sa_lsm.48
        mulsd   %xmm1, %xmm0    # u, tmp90
        subsd   %xmm0, %xmm3    # tmp90, v
        movsd   -24(%ebp), %xmm0        # s, tmp91
        divsd   %xmm1, %xmm0    # u, tmp91
        addsd   -16(%ebp), %xmm0        # w, tmp91
        movsd   %xmm0, -16(%ebp)        # tmp91, w
        jne     .L5     #,


It is somehow possible to tolerate that "s" and "w" are not promoted into
registers due to the missing live range splitting (PR 23322); the main problem
here is that the sign of "s" is flipped in memory using an (unaligned) xorb
insn (see the register-only sketch after the loops below). The same situation
shows up in the first (shorter) loop:

.L4:
        xorb    $-128, -17(%ebp)        #, s
        addl    $1, %eax        #, i
        cmpl    $512000001, %eax        #, i
        addsd   -24(%ebp), %xmm0        # s, sa_lsm.97
        jne     .L4     #,
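
For comparison, a minimal sketch (not part of the testcase; the helper below is
only an illustration of the desired code generation, assuming SSE2 intrinsics
are available) of flipping the sign of a double entirely in an XMM register,
which is what the loop body should do instead of the byte-wide xorb into the
stack slot:

#include <emmintrin.h>

/* Illustration only: negate a double without touching memory by
   xoring the sign bit in an XMM register (a single xorpd).  */
static inline double
negate_in_register (double s)
{
  const __m128d signbit = _mm_set_sd (-0.0);  /* 0x8000000000000000 in the low lane */
  return _mm_cvtsd_f64 (_mm_xor_pd (_mm_set_sd (s), signbit));
}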


The performance regression is caused by a partial memory stall [1].

[1] Agner Fog: How to optimize for the Pentium family of microprocessors,
section 14.7
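
To make the mechanism concrete, a minimal sketch (not from the report; the
union-based helper is only an illustration and assumes a little-endian layout)
of the store pattern that triggers the stall: a 1-byte store into the sign byte
of a double followed immediately by an 8-byte load of the same object, which is
exactly what the xorb/movsd pair in the loops above does:

/* Illustration only: byte-wide sign flip followed by a full-width
   reload, mirroring the xorb + movsd sequence.  The 1-byte store
   cannot be forwarded to the 8-byte load, so every iteration pays
   the store-forwarding (partial memory stall) penalty.  */
union dbits { double d; unsigned char b[sizeof (double)]; };

static double
negate_via_byte_store (double s)
{
  union dbits u;
  u.d = s;
  u.b[sizeof (double) - 1] ^= 0x80;  /* assumes little endian: sign byte is last */
  return u.d;
}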


-- 

ubizjak at gmail dot com changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
     Ever Confirmed|0                           |1
Last reconfirmed date|0000-00-00 00:00:00       |2008-01-07 12:19:54


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34682

