optimization/4714: Haney's real matrix performance regression

Sun Oct 28 03:47:00 GMT 2001

>Number:         4714
>Category:       optimization
>Synopsis:       Haney's real matrix performance regression
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    unassigned
>State:          open
>Class:          pessimizes-code
>Submitter-Id:   net
>Arrival-Date:   Sun Oct 28 03:46:00 PST 2001
>Closed-Date:
>Last-Modified:
>Originator:     Paolo Carlini <pcarlini@unitus.it>
>Release:        3.1 20011024(5,6,7)
>Organization:
>Environment:
i686-pc-linux-gnu (PII-400, Linux)
>Description:
Since October 24th, gcc 3.1 snapshots are no longer able
to optimize well the "O-O C++" version of the Real Matrix
test in the Haney suite
( ftp://ftp.kai.com/pub/benchmarks/haney_1p5.tar.gz ).

This is the mis-optimized triple loop:

////////////////
// Minimal float matrix used by the test case.  Elements live in the flat
// array d and are reached through 1-based (i, j) indices.  No constructor is
// shown in this excerpt, so nothing here initializes d or n.
class RealMatrix {
public:

  // Returns a writable reference to element (i, j).  Both indices are
  // 1-based; i is the fastest-varying index and n[0] is the distance in
  // elements between consecutive values of j.
  float &index(int i, int j)
    {
      return d[i - 1 + n[0] * (j - 1)];
    }
  // Read-only overload: same address computation, but returns the element
  // by value.
  float index(int i, int j) const
    {
      return d[i - 1 + n[0] * (j - 1)];
    }

  // Returns n[i - 1], i.e. the argument is 1-based: dim(1) is n[0].
  int dim(int i) const { return n[i - 1]; }

private:

  // Flat element storage addressed by index().
  float *d;
  // Values reported by dim(); index() only ever reads n[0].
  int n[4];
};

// Accumulates a matrix product into t.  For every i in 1..M, j in 1..N and
// k in 1..K it performs
//     t(i, j) += b(k, j) * a(i, k)
// where M = a.dim(1), N = b.dim(2) and K = b.dim(1).
// t is not cleared first, so the products are added to whatever it already
// holds.  Neither a.dim(2) nor the dimensions of t are read or checked.
void rmatMul(RealMatrix &t, const RealMatrix &a,
const RealMatrix &b)
{
  // Loop bounds are read once, before any element of t is written.
  const int M = a.dim(1), N = b.dim(2), K = b.dim(1);

  for (int j = 1; j <= N; j++)
    {
      for (int k = 1; k <= K; k++)
        {
          // b(k, j) does not depend on i, so it is fetched once per (j, k).
          float temp = b.index(k, j);
          // Skip the whole i loop when the multiplier is zero.  temp is a
          // float and 0.0 is a double literal, so temp is promoted for the
          // comparison.
          if (temp != 0.0)
            {
              // Innermost loop: walks i with j and k fixed, reading a(i, k)
              // and updating t(i, j) through the non-const index() overload.
              for (int i = 1; i <= M; i++)
                t.index(i, j) += temp * a.index(i, k);
            }
        }
    }
}
////////////////

If you compile it at -O2 on i686 you end up with the
following assembler for the innermost loop:

  80:   8b 4d 08                mov    0x8(%ebp),%ecx
  83:   d9 c0                   fld    %st(0)
  85:   8b 75 0c                mov    0xc(%ebp),%esi
  88:   8b 51 04                mov    0x4(%ecx),%edx
  8b:   8b 01                   mov    (%ecx),%eax
  8d:   8b 4e 04                mov    0x4(%esi),%ecx
  90:   0f af d7                imul   %edi,%edx
  93:   8d 14 1a                lea    (%edx,%ebx,1),%edx
  96:   8d 14 90                lea    (%eax,%edx,4),%edx
  99:   8b 45 dc                mov    0xffffffdc(%ebp),%eax
  9c:   0f af c1                imul   %ecx,%eax
  9f:   8b 0e                   mov    (%esi),%ecx
  a1:   8d 04 18                lea    (%eax,%ebx,1),%eax
  a4:   43                      inc    %ebx
  a5:   d8 4c 81 fc             fmuls  0xfffffffc(%ecx,%eax,4)
  a9:   3b 5d f0                cmp    0xfffffff0(%ebp),%ebx
  ac:   d8 42 fc                fadds  0xfffffffc(%edx)
  af:   d9 5a fc                fstps  0xfffffffc(%edx)
  b2:   7e cc                   jle    80 <rmatMul(RealMatrix&, RealMatrix const&, RealMatrix const&)+0x80>

Compare it with the code produced by the previous snapshots
(e.g., 20011007 from gcc.gnu.org) or by gcc 3.0.2:

  d0:   8d 04 0e                lea    (%esi,%ecx,1),%eax
  d3:   d9 c0                   fld    %st(0)
  d5:   41                      inc    %ecx
  d6:   d8 4c 83 fc             fmuls  0xfffffffc(%ebx,%eax,4)
  da:   d8 42 fc                fadds  0xfffffffc(%edx)
  dd:   d9 5a fc                fstps  0xfffffffc(%edx)
  e0:   83 c2 04                add    $0x4,%edx
  e3:   3b 4d f0                cmp    0xfffffff0(%ebp),%ecx
  e6:   7e e8                   jle    d0 <rmatMul(RealMatrix&, RealMatrix const&, RealMatrix const&)+0xd0>

The Haney test in question becomes 2x slower :-( 
>How-To-Repeat:
Compile the test case above at -O2. See also my message at
http://gcc.gnu.org/ml/gcc/2001-06/msg01554.html
for another recent slowdown in the Haney testsuite.
>Fix:

>Release-Note:
>Audit-Trail:
>Unformatted: