This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Re: 3.0-pre vs 3.0 on Haney speed


Hi again,

thanks to some nice tips from Andi (and the encouragement of Joe) I think I have
made some progress in the analysis of the annoying slowdown which I reported
some days ago.

First, I verified that the problem is not with the loop unrolling optimization:
I ran the test again and obtained essentially the same behavior (modulo
statistical fluctuations, as usual):

---
-O2
---
gcc version 3.0
---------------
Timing Complex matrix operations for n = 200
Language     Answer Check            Time     Iter Time
O-O C++      (-1.06982e+07,-2.82516e+08): 18.61    0.7444
Hand-coded C (-1.06982e+07,-2.82516e+08): 13.93    0.5572

gcc version 3.0 20010611 (prerelease)
-------------------------------------
Timing Complex matrix operations for n = 200
Language     Answer Check            Time     Iter Time
O-O C++      (-1.06982e+07,-2.82516e+08): 15.2    0.608
Hand-coded C (-1.06982e+07,-2.82516e+08): 14.87    0.5948

Then, I learned that the problem is connected with the optimizations enabled by
-O2, and disappears when compiling with -O1; that is, 3.0pre and 3.0 perform
essentially in the same way:

---
-O1
---
gcc version 3.0
---------------
Timing Complex matrix operations for n = 200
Language     Answer Check            Time     Iter Time
O-O C++      (-1.06982e+07,-2.82516e+08): 21.78    0.8712
Hand-coded C (-1.06982e+07,-2.82516e+08): 13.12    0.5248

gcc version 3.0 20010611 (prerelease)
-------------------------------------
Timing Complex matrix operations for n = 200
Language     Answer Check            Time     Iter Time
O-O C++      (-1.06982e+07,-2.82516e+08): 21.03    0.8412
Hand-coded C (-1.06982e+07,-2.82516e+08): 12.36    0.4944

Therefore I proceeded to nail down the relevant source code section and its
assembly, using the -O2 switch for the builds.

The following is the function which is executed 25 times for the "O-O C++" row
of the Haney benchmark, always with the same data (two fixed complex valued
square matrices, 200x200 in my reported results):

void cmatMul(ComplexMatrix &t, const ComplexMatrix &a, const ComplexMatrix &b)
{
  Integer i, j, k;
  const Integer M = a.dim(1), N = b.dim(2), K = b.dim(1);
  Complex temp;
  const Complex zero = Complex(0.0);

  memset(t, 0, M * N * sizeof(Complex));

  for (j = 1; j <= N; j++)
    {
      for (k = 1; k <= K; k++)
        {
          temp = b.index(k, j);
          if (temp != zero)
            {
              for (i = 1; i <= M; i++)
                t.index(i, j) += temp * a.index(i, k);
            }
        }
    }
}

This is a plain matrix multiplication, the only interesting points being the
use of memset to quickly zero the result matrix and the trick of checking for
zero entries in the first multiplier.
You will find attached to this message the two different assembly listings (for
3.0 and 3.0pre (20010611)) corresponding to it, as output by "objdump -S".

Anyway, the following are the two inner loops (over k and, inside, over i):

---
3.0
---
 804b2c0: d9 45 c8              flds   0xffffffc8(%ebp)
 804b2c3: 8b 7d 10              mov    0x10(%ebp),%edi
 804b2c6: 8b 45 a4              mov    0xffffffa4(%ebp),%eax
 804b2c9: 8b 57 08              mov    0x8(%edi),%edx
 804b2cc: 48                    dec    %eax
 804b2cd: 0f af c2              imul   %edx,%eax
 804b2d0: 8b 57 04              mov    0x4(%edi),%edx
 804b2d3: 8d 04 30              lea    (%eax,%esi,1),%eax
 804b2d6: 8d 04 c2              lea    (%edx,%eax,8),%eax
 804b2d9: 31 d2                 xor    %edx,%edx
 804b2db: d9 40 f8              flds   0xfffffff8(%eax)
 804b2de: d9 55 b8              fsts   0xffffffb8(%ebp)
 804b2e1: d9 40 fc              flds   0xfffffffc(%eax)
 804b2e4: d9 c9                 fxch   %st(1)
 804b2e6: dd e2                 fucom  %st(2)
 804b2e8: df e0                 fnstsw %ax
 804b2ea: dd da                 fstp   %st(2)
 804b2ec: d9 c9                 fxch   %st(1)
 804b2ee: d9 5d d8              fstps  0xffffffd8(%ebp)
 804b2f1: 9e                    sahf
 804b2f2: d9 55 bc              fsts   0xffffffbc(%ebp)
 804b2f5: d9 55 dc              fsts   0xffffffdc(%ebp)
 804b2f8: 7a 17                 jp     804b311
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf1>
 804b2fa: 75 15                 jne    804b311
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf1>
 804b2fc: d9 45 cc              flds   0xffffffcc(%ebp)
 804b2ff: d9 c9                 fxch   %st(1)
 804b301: da e9                 fucompp
 804b303: df e0                 fnstsw %ax
 804b305: 9e                    sahf
 804b306: 7a 0b                 jp     804b313
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
 804b308: 75 09                 jne    804b313
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
 804b30a: ba 01 00 00 00        mov    $0x1,%edx
 804b30f: eb 02                 jmp    804b313
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
 804b311: dd d8                 fstp   %st(0)
 804b313: 85 d2                 test   %edx,%edx
 804b315: 0f 85 94 00 00 00     jne    804b3af
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x18f>
 804b31b: bb 01 00 00 00        mov    $0x1,%ebx
 804b320: 3b 5d a0              cmp    0xffffffa0(%ebp),%ebx
 804b323: 0f 8f 86 00 00 00     jg     804b3af
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x18f>
 804b329: 8b 55 8c              mov    0xffffff8c(%ebp),%edx
 804b32c: 8b 4d 90              mov    0xffffff90(%ebp),%ecx
 804b32f: 89 55 94              mov    %edx,0xffffff94(%ebp)
 804b332: 89 4d 84              mov    %ecx,0xffffff84(%ebp)
 804b335: 8d 74 26 00           lea    0x0(%esi,1),%esi
 804b339: 8d bc 27 00 00 00 00  lea    0x0(%edi,1),%edi

 804b340: d9 45 d8              flds   0xffffffd8(%ebp)
 804b343: 8b 7d 08              mov    0x8(%ebp),%edi
 804b346: 8b 4d 94              mov    0xffffff94(%ebp),%ecx
 804b349: d9 45 dc              flds   0xffffffdc(%ebp)
 804b34c: 8b 47 08              mov    0x8(%edi),%eax
 804b34f: d9 c1                 fld    %st(1)
 804b351: 8b 55 84              mov    0xffffff84(%ebp),%edx
 804b354: d9 c1                 fld    %st(1)
 804b356: 0f af c8              imul   %eax,%ecx
 804b359: 8b 47 04              mov    0x4(%edi),%eax
 804b35c: 8d 0c 19              lea    (%ecx,%ebx,1),%ecx
 804b35f: 8d 0c c8              lea    (%eax,%ecx,8),%ecx
 804b362: 8b 45 0c              mov    0xc(%ebp),%eax
 804b365: 8b 78 08              mov    0x8(%eax),%edi
 804b368: 8b 40 04              mov    0x4(%eax),%eax
 804b36b: 0f af d7              imul   %edi,%edx
 804b36e: 8d 14 1a              lea    (%edx,%ebx,1),%edx
 804b371: 43                    inc    %ebx
 804b372: 8d 14 d0              lea    (%eax,%edx,8),%edx
 804b375: d9 42 f8              flds   0xfffffff8(%edx)
 804b378: dc ca                 fmul   %st,%st(2)
 804b37a: dc cb                 fmul   %st,%st(3)
 804b37c: d9 5d a8              fstps  0xffffffa8(%ebp)
 804b37f: d9 42 fc              flds   0xfffffffc(%edx)
 804b382: dc c9                 fmul   %st,%st(1)
 804b384: dc cc                 fmul   %st,%st(4)
 804b386: d9 5d ac              fstps  0xffffffac(%ebp)
 804b389: de e9                 fsubrp %st,%st(1)
 804b38b: d9 ca                 fxch   %st(2)
 804b38d: de c1                 faddp  %st,%st(1)
 804b38f: d9 c9                 fxch   %st(1)
 804b391: d9 55 b8              fsts   0xffffffb8(%ebp)
 804b394: d9 c9                 fxch   %st(1)
 804b396: d9 5d bc              fstps  0xffffffbc(%ebp)
 804b399: d9 41 f8              flds   0xfffffff8(%ecx)
 804b39c: de c1                 faddp  %st,%st(1)
 804b39e: d9 59 f8              fstps  0xfffffff8(%ecx)
 804b3a1: d9 41 fc              flds   0xfffffffc(%ecx)
 804b3a4: d8 45 bc              fadds  0xffffffbc(%ebp)
 804b3a7: d9 59 fc              fstps  0xfffffffc(%ecx)
 804b3aa: 3b 5d a0              cmp    0xffffffa0(%ebp),%ebx
 804b3ad: 7e 91                 jle    804b340
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x120>

 804b3af: ff 45 90              incl   0xffffff90(%ebp)
 804b3b2: 46                    inc    %esi
 804b3b3: 3b 75 98              cmp    0xffffff98(%ebp),%esi
 804b3b6: 0f 8e 04 ff ff ff     jle    804b2c0
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xa0>

------
3.0pre
------
 804b2f0: d9 45 c8              flds   0xffffffc8(%ebp)
 804b2f3: 8b 7d 10              mov    0x10(%ebp),%edi
 804b2f6: 8b 85 74 ff ff ff     mov    0xffffff74(%ebp),%eax
 804b2fc: 8b 57 04              mov    0x4(%edi),%edx
 804b2ff: 01 d0                 add    %edx,%eax
 804b301: 31 d2                 xor    %edx,%edx
 804b303: d9 40 f8              flds   0xfffffff8(%eax)
 804b306: d9 55 b8              fsts   0xffffffb8(%ebp)
 804b309: d9 40 fc              flds   0xfffffffc(%eax)
 804b30c: d9 c9                 fxch   %st(1)
 804b30e: dd e2                 fucom  %st(2)
 804b310: df e0                 fnstsw %ax
 804b312: dd da                 fstp   %st(2)
 804b314: d9 c9                 fxch   %st(1)
 804b316: d9 5d d8              fstps  0xffffffd8(%ebp)
 804b319: 9e                    sahf
 804b31a: d9 55 bc              fsts   0xffffffbc(%ebp)
 804b31d: d9 55 dc              fsts   0xffffffdc(%ebp)
 804b320: 7a 17                 jp     804b339
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x119>
 804b322: 75 15                 jne    804b339
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x119>
 804b324: d9 45 cc              flds   0xffffffcc(%ebp)
 804b327: d9 c9                 fxch   %st(1)
 804b329: da e9                 fucompp
 804b32b: df e0                 fnstsw %ax
 804b32d: 9e                    sahf
 804b32e: 7a 0b                 jp     804b33b
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
 804b330: 75 09                 jne    804b33b
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
 804b332: ba 01 00 00 00        mov    $0x1,%edx
 804b337: eb 02                 jmp    804b33b
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
 804b339: dd d8                 fstp   %st(0)
 804b33b: 85 d2                 test   %edx,%edx
 804b33d: 0f 85 92 00 00 00     jne    804b3d5
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1b5>
 804b343: 8b 7d 9c              mov    0xffffff9c(%ebp),%edi
 804b346: 85 ff                 test   %edi,%edi
 804b348: 0f 8e 87 00 00 00     jle    804b3d5
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1b5>
 804b34e: 8b 7d 0c              mov    0xc(%ebp),%edi
 804b351: 8b 4d 08              mov    0x8(%ebp),%ecx
 804b354: 8b 45 88              mov    0xffffff88(%ebp),%eax
 804b357: 8b 5f 08              mov    0x8(%edi),%ebx
 804b35a: 8b 71 08              mov    0x8(%ecx),%esi
 804b35d: 8b 55 90              mov    0xffffff90(%ebp),%edx
 804b360: 0f af c3              imul   %ebx,%eax
 804b363: 0f af d6              imul   %esi,%edx
 804b366: 8d 34 c5 08 00 00 00  lea    0x8(,%eax,8),%esi
 804b36d: 8b 41 04              mov    0x4(%ecx),%eax
 804b370: 8b 4d 9c              mov    0xffffff9c(%ebp),%ecx
 804b373: 8d 1c d5 08 00 00 00  lea    0x8(,%edx,8),%ebx
 804b37a: 89 45 8c              mov    %eax,0xffffff8c(%ebp)
 804b37d: 8d 76 00              lea    0x0(%esi),%esi

 804b380: d9 45 d8              flds   0xffffffd8(%ebp)
 804b383: 89 f0                 mov    %esi,%eax
 804b385: 83 c6 08              add    $0x8,%esi
 804b388: d9 45 dc              flds   0xffffffdc(%ebp)
 804b38b: 8b 7d 0c              mov    0xc(%ebp),%edi
 804b38e: d9 c1                 fld    %st(1)
 804b390: d9 c1                 fld    %st(1)
 804b392: 8b 55 8c              mov    0xffffff8c(%ebp),%edx
 804b395: 03 47 04              add    0x4(%edi),%eax
 804b398: 01 da                 add    %ebx,%edx
 804b39a: 83 c3 08              add    $0x8,%ebx
 804b39d: 49                    dec    %ecx
 804b39e: d9 40 f8              flds   0xfffffff8(%eax)
 804b3a1: dc ca                 fmul   %st,%st(2)
 804b3a3: dc cb                 fmul   %st,%st(3)
 804b3a5: d9 5d a8              fstps  0xffffffa8(%ebp)
 804b3a8: d9 40 fc              flds   0xfffffffc(%eax)
 804b3ab: dc c9                 fmul   %st,%st(1)
 804b3ad: dc cc                 fmul   %st,%st(4)
 804b3af: d9 5d ac              fstps  0xffffffac(%ebp)
 804b3b2: de e9                 fsubrp %st,%st(1)
 804b3b4: d9 ca                 fxch   %st(2)
 804b3b6: de c1                 faddp  %st,%st(1)
 804b3b8: d9 c9                 fxch   %st(1)
 804b3ba: d9 55 b8              fsts   0xffffffb8(%ebp)
 804b3bd: d9 c9                 fxch   %st(1)
 804b3bf: d9 5d bc              fstps  0xffffffbc(%ebp)
 804b3c2: d9 42 f8              flds   0xfffffff8(%edx)
 804b3c5: de c1                 faddp  %st,%st(1)
 804b3c7: d9 5a f8              fstps  0xfffffff8(%edx)
 804b3ca: d9 42 fc              flds   0xfffffffc(%edx)
 804b3cd: d8 45 bc              fadds  0xffffffbc(%ebp)
 804b3d0: d9 5a fc              fstps  0xfffffffc(%edx)
 804b3d3: 75 ab                 jne    804b380
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x160>

 804b3d5: ff 45 a0              incl   0xffffffa0(%ebp)
 804b3d8: 8b 45 94              mov    0xffffff94(%ebp),%eax
 804b3db: ff 45 88              incl   0xffffff88(%ebp)
 804b3de: 83 85 74 ff ff ff 08  addl   $0x8,0xffffff74(%ebp)
 804b3e5: 39 45 a0              cmp    %eax,0xffffffa0(%ebp)
 804b3e8: 0f 8e 02 ff ff ff     jle    804b2f0
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xd0>


Indeed, in the innermost loop (that over i, for 3.0: 804b340-804b3ad, for
3.0pre: 804b380-804b3d3) there are clear differences, e.g., 8 mov for 3.0 vs
only 3 mov (1 between regs) for 3.0pre !!

But at this point my recollections from my basic university course of x86
assembler are running out and I definitely need some more help from the list to
understand more of this disappointing behavior of the released 3.0.

A final observation: on my system current 3.1 snapshots perform on the test
similarly to 3.0, that is similarly to 2.95.x, that is worse :( than 3.0pre,
which consistently in the last three months showed C++ very close to Hand-coded
C...

Cheers,
Paolo.


// Complex

void cmatMul(ComplexMatrix &t, const ComplexMatrix &a, const ComplexMatrix &b)
{
 804b220:	55                   	push   %ebp
 804b221:	89 e5                	mov    %esp,%ebp
 804b223:	57                   	push   %edi
  const T &data() const { return *d; }

  Boolean ownData() const { return myData; }

  Integer dim(Integer i) const { return n[i - 1]; }
 804b224:	8b 45 0c             	mov    0xc(%ebp),%eax
 804b227:	56                   	push   %esi
  
  BaseArray<T> &operator=(const BaseArray<T> &a);

  operator const T *() const { return d; }
  operator T *() { return d; }
 804b228:	8b 55 08             	mov    0x8(%ebp),%edx
 804b22b:	53                   	push   %ebx
 804b22c:	83 c4 80             	add    $0xffffff80,%esp
  const T &data() const { return *d; }

  Boolean ownData() const { return myData; }

  Integer dim(Integer i) const { return n[i - 1]; }
 804b22f:	8b 40 08             	mov    0x8(%eax),%eax
 804b232:	89 45 a0             	mov    %eax,0xffffffa0(%ebp)
 804b235:	8b 45 10             	mov    0x10(%ebp),%eax
 804b238:	8b 40 0c             	mov    0xc(%eax),%eax
 804b23b:	89 45 9c             	mov    %eax,0xffffff9c(%ebp)
 804b23e:	8b 45 10             	mov    0x10(%ebp),%eax
 804b241:	8b 5d 9c             	mov    0xffffff9c(%ebp),%ebx
 804b244:	8b 40 08             	mov    0x8(%eax),%eax
    }
  BaseComplex(T rp, T ip = 0.0)
    {
      re = rp;
      im = ip;
 804b247:	c7 45 cc 00 00 00 00 	movl   $0x0,0xffffffcc(%ebp)
 804b24e:	c7 45 dc 00 00 00 00 	movl   $0x0,0xffffffdc(%ebp)
 804b255:	c7 45 d8 00 00 00 00 	movl   $0x0,0xffffffd8(%ebp)
  const T &data() const { return *d; }

  Boolean ownData() const { return myData; }

  Integer dim(Integer i) const { return n[i - 1]; }
 804b25c:	89 45 98             	mov    %eax,0xffffff98(%ebp)
 804b25f:	8b 45 a0             	mov    0xffffffa0(%ebp),%eax
      re = im = T(0.0);
    }
  BaseComplex(T rp, T ip = 0.0)
    {
      re = rp;
 804b262:	c7 45 c8 00 00 00 00 	movl   $0x0,0xffffffc8(%ebp)
  
  BaseArray<T> &operator=(const BaseArray<T> &a);

  operator const T *() const { return d; }
  operator T *() { return d; }
 804b269:	0f af c3             	imul   %ebx,%eax
 804b26c:	c1 e0 03             	shl    $0x3,%eax
 804b26f:	50                   	push   %eax
 804b270:	6a 00                	push   $0x0
 804b272:	8b 4a 04             	mov    0x4(%edx),%ecx
 804b275:	51                   	push   %ecx
 804b276:	e8 85 e5 ff ff       	call   8049800 <_init+0x268>
  Integer i, j, k;
  const Integer M = a.dim(1), N = b.dim(2), K = b.dim(1);
  Complex temp;
  const Complex zero = Complex(0.0);

  memset(t, 0, M * N * sizeof(Complex));
  
  for (j = 1; j <= N; j++)
 804b27b:	8b 4d 9c             	mov    0xffffff9c(%ebp),%ecx
 804b27e:	83 c4 10             	add    $0x10,%esp
 804b281:	c7 45 a4 01 00 00 00 	movl   $0x1,0xffffffa4(%ebp)
 804b288:	39 4d a4             	cmp    %ecx,0xffffffa4(%ebp)
 804b28b:	0f 8f 3d 01 00 00    	jg     804b3ce <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1ae>
 804b291:	c7 45 8c 00 00 00 00 	movl   $0x0,0xffffff8c(%ebp)
 804b298:	90                   	nop    
 804b299:	8d b4 26 00 00 00 00 	lea    0x0(%esi,1),%esi
    {
      for (k = 1; k <= K; k++)
 804b2a0:	be 01 00 00 00       	mov    $0x1,%esi
 804b2a5:	3b 75 98             	cmp    0xffffff98(%ebp),%esi
 804b2a8:	0f 8f 0e 01 00 00    	jg     804b3bc <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x19c>
 804b2ae:	c7 45 90 00 00 00 00 	movl   $0x0,0xffffff90(%ebp)
 804b2b5:	8d 74 26 00          	lea    0x0(%esi,1),%esi
 804b2b9:	8d bc 27 00 00 00 00 	lea    0x0(%edi,1),%edi
      im = c.im;
    }
  
  int operator==(const BaseComplex<T> &c) const
    {
 804b2c0:	d9 45 c8             	flds   0xffffffc8(%ebp)
 804b2c3:	8b 7d 10             	mov    0x10(%ebp),%edi
 804b2c6:	8b 45 a4             	mov    0xffffffa4(%ebp),%eax
 804b2c9:	8b 57 08             	mov    0x8(%edi),%edx
 804b2cc:	48                   	dec    %eax
 804b2cd:	0f af c2             	imul   %edx,%eax
 804b2d0:	8b 57 04             	mov    0x4(%edi),%edx
 804b2d3:	8d 04 30             	lea    (%eax,%esi,1),%eax
 804b2d6:	8d 04 c2             	lea    (%edx,%eax,8),%eax
 804b2d9:	31 d2                	xor    %edx,%edx
 804b2db:	d9 40 f8             	flds   0xfffffff8(%eax)
 804b2de:	d9 55 b8             	fsts   0xffffffb8(%ebp)
 804b2e1:	d9 40 fc             	flds   0xfffffffc(%eax)
 804b2e4:	d9 c9                	fxch   %st(1)
 804b2e6:	dd e2                	fucom  %st(2)
 804b2e8:	df e0                	fnstsw %ax
 804b2ea:	dd da                	fstp   %st(2)
 804b2ec:	d9 c9                	fxch   %st(1)
      return (re == c.re && im == c.im);
    }
  int operator!=(const BaseComplex<T> &c) const
    {
      return !(*this == c);
    }

  BaseComplex<T> invert() const
    {
      T normalize = (re * re)+(im * im);
      return BaseComplex<T>((re / normalize), (-im / normalize));
    }
  BaseComplex<T> operator-() const
    {
      return BaseComplex<T>(-re, -im);
    }
  BaseComplex<T> conj() const
    {
      return BaseComplex<T>(re, -im);
    }
  int operator!() const
    {
      return ((re == 0.0) ? 1 : 0);
    }

  BaseComplex<T>& operator=(const BaseComplex<T> &c)
    {
      re = c.re;
 804b2ee:	d9 5d d8             	fstps  0xffffffd8(%ebp)
 804b2f1:	9e                   	sahf   
 804b2f2:	d9 55 bc             	fsts   0xffffffbc(%ebp)
      im = c.im;
 804b2f5:	d9 55 dc             	fsts   0xffffffdc(%ebp)
 804b2f8:	7a 17                	jp     804b311 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf1>
 804b2fa:	75 15                	jne    804b311 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf1>
 804b2fc:	d9 45 cc             	flds   0xffffffcc(%ebp)
 804b2ff:	d9 c9                	fxch   %st(1)
 804b301:	da e9                	fucompp 
 804b303:	df e0                	fnstsw %ax
 804b305:	9e                   	sahf   
 804b306:	7a 0b                	jp     804b313 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
 804b308:	75 09                	jne    804b313 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
 804b30a:	ba 01 00 00 00       	mov    $0x1,%edx
 804b30f:	eb 02                	jmp    804b313 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
 804b311:	dd d8                	fstp   %st(0)
 804b313:	85 d2                	test   %edx,%edx
 804b315:	0f 85 94 00 00 00    	jne    804b3af <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x18f>
        {
          temp = b.index(k, j);
          if (temp != zero)
            {
              for (i = 1; i <= M; i++)
 804b31b:	bb 01 00 00 00       	mov    $0x1,%ebx
 804b320:	3b 5d a0             	cmp    0xffffffa0(%ebp),%ebx
 804b323:	0f 8f 86 00 00 00    	jg     804b3af <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x18f>
 804b329:	8b 55 8c             	mov    0xffffff8c(%ebp),%edx
 804b32c:	8b 4d 90             	mov    0xffffff90(%ebp),%ecx
 804b32f:	89 55 94             	mov    %edx,0xffffff94(%ebp)
 804b332:	89 4d 84             	mov    %ecx,0xffffff84(%ebp)
 804b335:	8d 74 26 00          	lea    0x0(%esi,1),%esi
 804b339:	8d bc 27 00 00 00 00 	lea    0x0(%edi,1),%edi
    {
      re = im = T(0.0);
    }
  BaseComplex(T rp, T ip = 0.0)
    {
 804b340:	d9 45 d8             	flds   0xffffffd8(%ebp)
      return d[i - b[0] + n[0] * (j - b[1])];
    }

  Complex &index(Integer i, Integer j)
    {
 804b343:	8b 7d 08             	mov    0x8(%ebp),%edi
 804b346:	8b 4d 94             	mov    0xffffff94(%ebp),%ecx
    {
      re = im = T(0.0);
    }
  BaseComplex(T rp, T ip = 0.0)
    {
 804b349:	d9 45 dc             	flds   0xffffffdc(%ebp)
      return d[i - b[0] + n[0] * (j - b[1])];
    }

  Complex &index(Integer i, Integer j)
    {
 804b34c:	8b 47 08             	mov    0x8(%edi),%eax
    {
      re = im = T(0.0);
    }
  BaseComplex(T rp, T ip = 0.0)
    {
 804b34f:	d9 c1                	fld    %st(1)
      re = rp;
      im = ip;
    }
  BaseComplex(const BaseComplex<T> &c)
    {
 804b351:	8b 55 84             	mov    0xffffff84(%ebp),%edx
 804b354:	d9 c1                	fld    %st(1)
      return d[i - b[0] + n[0] * (j - b[1])];
    }

  Complex &index(Integer i, Integer j)
    {
 804b356:	0f af c8             	imul   %eax,%ecx
 804b359:	8b 47 04             	mov    0x4(%edi),%eax
 804b35c:	8d 0c 19             	lea    (%ecx,%ebx,1),%ecx
 804b35f:	8d 0c c8             	lea    (%eax,%ecx,8),%ecx
      re = rp;
      im = ip;
    }
  BaseComplex(const BaseComplex<T> &c)
    {
 804b362:	8b 45 0c             	mov    0xc(%ebp),%eax
 804b365:	8b 78 08             	mov    0x8(%eax),%edi
 804b368:	8b 40 04             	mov    0x4(%eax),%eax
 804b36b:	0f af d7             	imul   %edi,%edx
 804b36e:	8d 14 1a             	lea    (%edx,%ebx,1),%edx
 804b371:	43                   	inc    %ebx
      re = rp;
      im = ip;
    }
  BaseComplex(const BaseComplex<T> &c)
    {
 804b372:	8d 14 d0             	lea    (%eax,%edx,8),%edx
      re = c.re;
 804b375:	d9 42 f8             	flds   0xfffffff8(%edx)
 804b378:	dc ca                	fmul   %st,%st(2)
 804b37a:	dc cb                	fmul   %st,%st(3)
 804b37c:	d9 5d a8             	fstps  0xffffffa8(%ebp)
      im = c.im;
 804b37f:	d9 42 fc             	flds   0xfffffffc(%edx)
 804b382:	dc c9                	fmul   %st,%st(1)
 804b384:	dc cc                	fmul   %st,%st(4)
 804b386:	d9 5d ac             	fstps  0xffffffac(%ebp)
 804b389:	de e9                	fsubrp %st,%st(1)
 804b38b:	d9 ca                	fxch   %st(2)
 804b38d:	de c1                	faddp  %st,%st(1)
 804b38f:	d9 c9                	fxch   %st(1)
 804b391:	d9 55 b8             	fsts   0xffffffb8(%ebp)
 804b394:	d9 c9                	fxch   %st(1)
 804b396:	d9 5d bc             	fstps  0xffffffbc(%ebp)
    }
  
  int operator==(const BaseComplex<T> &c) const
    {
      return (re == c.re && im == c.im);
    }
  int operator!=(const BaseComplex<T> &c) const
    {
      return !(*this == c);
    }

  BaseComplex<T> invert() const
    {
      T normalize = (re * re)+(im * im);
      return BaseComplex<T>((re / normalize), (-im / normalize));
    }
  BaseComplex<T> operator-() const
    {
      return BaseComplex<T>(-re, -im);
    }
  BaseComplex<T> conj() const
    {
      return BaseComplex<T>(re, -im);
    }
  int operator!() const
    {
      return ((re == 0.0) ? 1 : 0);
    }

  BaseComplex<T>& operator=(const BaseComplex<T> &c)
    {
      re = c.re;
      im = c.im;
      return *this;
    }
  void operator+=(const BaseComplex<T> &c)
    {
      re += c.re;
 804b399:	d9 41 f8             	flds   0xfffffff8(%ecx)
 804b39c:	de c1                	faddp  %st,%st(1)
 804b39e:	d9 59 f8             	fstps  0xfffffff8(%ecx)
      im += c.im;
 804b3a1:	d9 41 fc             	flds   0xfffffffc(%ecx)
 804b3a4:	d8 45 bc             	fadds  0xffffffbc(%ebp)
 804b3a7:	d9 59 fc             	fstps  0xfffffffc(%ecx)
 804b3aa:	3b 5d a0             	cmp    0xffffffa0(%ebp),%ebx
 804b3ad:	7e 91                	jle    804b340 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x120>
 804b3af:	ff 45 90             	incl   0xffffff90(%ebp)
 804b3b2:	46                   	inc    %esi
 804b3b3:	3b 75 98             	cmp    0xffffff98(%ebp),%esi
 804b3b6:	0f 8e 04 ff ff ff    	jle    804b2c0 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xa0>
 804b3bc:	ff 45 a4             	incl   0xffffffa4(%ebp)
 804b3bf:	8b 55 9c             	mov    0xffffff9c(%ebp),%edx
 804b3c2:	ff 45 8c             	incl   0xffffff8c(%ebp)
 804b3c5:	39 55 a4             	cmp    %edx,0xffffffa4(%ebp)
 804b3c8:	0f 8e d2 fe ff ff    	jle    804b2a0 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x80>
                t.index(i, j) += temp * a.index(i, k);
            }
        }
    }
}
 804b3ce:	8d 65 f4             	lea    0xfffffff4(%ebp),%esp
 804b3d1:	5b                   	pop    %ebx
 804b3d2:	5e                   	pop    %esi
 804b3d3:	5f                   	pop    %edi
 804b3d4:	5d                   	pop    %ebp
 804b3d5:	c3                   	ret    



// Complex

void cmatMul(ComplexMatrix &t, const ComplexMatrix &a, const ComplexMatrix &b)
{
 804b220:	55                   	push   %ebp
 804b221:	89 e5                	mov    %esp,%ebp
 804b223:	57                   	push   %edi
 804b224:	56                   	push   %esi
 804b225:	53                   	push   %ebx
 804b226:	81 ec 90 00 00 00    	sub    $0x90,%esp
  
  BaseArray<T> &operator=(const BaseArray<T> &a);

  operator const T *() const { return d; }
  operator T *() { return d; }
 804b22c:	8b 55 08             	mov    0x8(%ebp),%edx
    }
  BaseComplex(T rp, T ip = 0.0)
    {
      re = rp;
      im = ip;
 804b22f:	c7 45 cc 00 00 00 00 	movl   $0x0,0xffffffcc(%ebp)
  const T &data() const { return *d; }

  Boolean ownData() const { return myData; }

  Integer dim(Integer i) const { return n[i - 1]; }
 804b236:	8b 45 0c             	mov    0xc(%ebp),%eax
  T im;

  BaseComplex()
    {
      re = im = T(0.0);
 804b239:	c7 45 dc 00 00 00 00 	movl   $0x0,0xffffffdc(%ebp)
 804b240:	c7 45 d8 00 00 00 00 	movl   $0x0,0xffffffd8(%ebp)
  const T &data() const { return *d; }

  Boolean ownData() const { return myData; }

  Integer dim(Integer i) const { return n[i - 1]; }
 804b247:	8b 40 08             	mov    0x8(%eax),%eax
      re = im = T(0.0);
    }
  BaseComplex(T rp, T ip = 0.0)
    {
      re = rp;
 804b24a:	c7 45 c8 00 00 00 00 	movl   $0x0,0xffffffc8(%ebp)
  const T &data() const { return *d; }

  Boolean ownData() const { return myData; }

  Integer dim(Integer i) const { return n[i - 1]; }
 804b251:	89 45 9c             	mov    %eax,0xffffff9c(%ebp)
 804b254:	8b 45 10             	mov    0x10(%ebp),%eax
 804b257:	8b 40 0c             	mov    0xc(%eax),%eax
 804b25a:	89 45 98             	mov    %eax,0xffffff98(%ebp)
 804b25d:	8b 45 10             	mov    0x10(%ebp),%eax
 804b260:	8b 75 98             	mov    0xffffff98(%ebp),%esi
 804b263:	8b 40 08             	mov    0x8(%eax),%eax
 804b266:	89 45 94             	mov    %eax,0xffffff94(%ebp)
 804b269:	8b 45 9c             	mov    0xffffff9c(%ebp),%eax
 804b26c:	0f af c6             	imul   %esi,%eax
 804b26f:	c1 e0 03             	shl    $0x3,%eax
 804b272:	50                   	push   %eax
 804b273:	6a 00                	push   $0x0
 804b275:	8b 5a 04             	mov    0x4(%edx),%ebx
 804b278:	53                   	push   %ebx
 804b279:	e8 82 e5 ff ff       	call   8049800 <_init+0x268>
  Integer i, j, k;
  const Integer M = a.dim(1), N = b.dim(2), K = b.dim(1);
  Complex temp;
  const Complex zero = Complex(0.0);

  memset(t, 0, M * N * sizeof(Complex));
  
  for (j = 1; j <= N; j++)
 804b27e:	8b 4d 98             	mov    0xffffff98(%ebp),%ecx
 804b281:	83 c4 10             	add    $0x10,%esp
 804b284:	c7 45 a4 01 00 00 00 	movl   $0x1,0xffffffa4(%ebp)
 804b28b:	39 4d a4             	cmp    %ecx,0xffffffa4(%ebp)
 804b28e:	0f 8f 6f 01 00 00    	jg     804b403 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1e3>
 804b294:	c7 45 84 00 00 00 00 	movl   $0x0,0xffffff84(%ebp)
 804b29b:	c7 45 80 00 00 00 00 	movl   $0x0,0xffffff80(%ebp)
 804b2a2:	8d b4 26 00 00 00 00 	lea    0x0(%esi,1),%esi
 804b2a9:	8d bc 27 00 00 00 00 	lea    0x0(%edi,1),%edi
    {
      for (k = 1; k <= K; k++)
 804b2b0:	c7 45 a0 01 00 00 00 	movl   $0x1,0xffffffa0(%ebp)
 804b2b7:	8b 7d 94             	mov    0xffffff94(%ebp),%edi
 804b2ba:	39 7d a0             	cmp    %edi,0xffffffa0(%ebp)
 804b2bd:	0f 8f 2b 01 00 00    	jg     804b3ee <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1ce>
 804b2c3:	8b 55 10             	mov    0x10(%ebp),%edx
 804b2c6:	8b 45 80             	mov    0xffffff80(%ebp),%eax
 804b2c9:	8b 4a 08             	mov    0x8(%edx),%ecx
 804b2cc:	c7 45 88 00 00 00 00 	movl   $0x0,0xffffff88(%ebp)
 804b2d3:	0f af c1             	imul   %ecx,%eax
 804b2d6:	8b 4d 84             	mov    0xffffff84(%ebp),%ecx
 804b2d9:	89 4d 90             	mov    %ecx,0xffffff90(%ebp)
 804b2dc:	8d 04 c5 08 00 00 00 	lea    0x8(,%eax,8),%eax
 804b2e3:	89 85 74 ff ff ff    	mov    %eax,0xffffff74(%ebp)
 804b2e9:	8d b4 26 00 00 00 00 	lea    0x0(%esi,1),%esi
      im = c.im;
    }
  
  int operator==(const BaseComplex<T> &c) const
    {
 804b2f0:	d9 45 c8             	flds   0xffffffc8(%ebp)
 804b2f3:	8b 7d 10             	mov    0x10(%ebp),%edi
 804b2f6:	8b 85 74 ff ff ff    	mov    0xffffff74(%ebp),%eax
 804b2fc:	8b 57 04             	mov    0x4(%edi),%edx
 804b2ff:	01 d0                	add    %edx,%eax
 804b301:	31 d2                	xor    %edx,%edx
 804b303:	d9 40 f8             	flds   0xfffffff8(%eax)
 804b306:	d9 55 b8             	fsts   0xffffffb8(%ebp)
 804b309:	d9 40 fc             	flds   0xfffffffc(%eax)
 804b30c:	d9 c9                	fxch   %st(1)
 804b30e:	dd e2                	fucom  %st(2)
 804b310:	df e0                	fnstsw %ax
 804b312:	dd da                	fstp   %st(2)
 804b314:	d9 c9                	fxch   %st(1)
      return (re == c.re && im == c.im);
    }
  int operator!=(const BaseComplex<T> &c) const
    {
      return !(*this == c);
    }

  BaseComplex<T> invert() const
    {
      T normalize = (re * re)+(im * im);
      return BaseComplex<T>((re / normalize), (-im / normalize));
    }
  BaseComplex<T> operator-() const
    {
      return BaseComplex<T>(-re, -im);
    }
  BaseComplex<T> conj() const
    {
      return BaseComplex<T>(re, -im);
    }
  int operator!() const
    {
      return ((re == 0.0) ? 1 : 0);
    }

  BaseComplex<T>& operator=(const BaseComplex<T> &c)
    {
      re = c.re;
 804b316:	d9 5d d8             	fstps  0xffffffd8(%ebp)
 804b319:	9e                   	sahf   
 804b31a:	d9 55 bc             	fsts   0xffffffbc(%ebp)
      im = c.im;
 804b31d:	d9 55 dc             	fsts   0xffffffdc(%ebp)
 804b320:	7a 17                	jp     804b339 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x119>
 804b322:	75 15                	jne    804b339 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x119>
 804b324:	d9 45 cc             	flds   0xffffffcc(%ebp)
 804b327:	d9 c9                	fxch   %st(1)
 804b329:	da e9                	fucompp 
 804b32b:	df e0                	fnstsw %ax
 804b32d:	9e                   	sahf   
 804b32e:	7a 0b                	jp     804b33b <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
 804b330:	75 09                	jne    804b33b <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
 804b332:	ba 01 00 00 00       	mov    $0x1,%edx
 804b337:	eb 02                	jmp    804b33b <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
 804b339:	dd d8                	fstp   %st(0)
 804b33b:	85 d2                	test   %edx,%edx
 804b33d:	0f 85 92 00 00 00    	jne    804b3d5 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1b5>
        {
          temp = b.index(k, j);
          if (temp != zero)
            {
              for (i = 1; i <= M; i++)
 804b343:	8b 7d 9c             	mov    0xffffff9c(%ebp),%edi
 804b346:	85 ff                	test   %edi,%edi
 804b348:	0f 8e 87 00 00 00    	jle    804b3d5 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1b5>
 804b34e:	8b 7d 0c             	mov    0xc(%ebp),%edi
 804b351:	8b 4d 08             	mov    0x8(%ebp),%ecx
 804b354:	8b 45 88             	mov    0xffffff88(%ebp),%eax
 804b357:	8b 5f 08             	mov    0x8(%edi),%ebx
 804b35a:	8b 71 08             	mov    0x8(%ecx),%esi
 804b35d:	8b 55 90             	mov    0xffffff90(%ebp),%edx
 804b360:	0f af c3             	imul   %ebx,%eax
 804b363:	0f af d6             	imul   %esi,%edx
 804b366:	8d 34 c5 08 00 00 00 	lea    0x8(,%eax,8),%esi
 804b36d:	8b 41 04             	mov    0x4(%ecx),%eax
 804b370:	8b 4d 9c             	mov    0xffffff9c(%ebp),%ecx
 804b373:	8d 1c d5 08 00 00 00 	lea    0x8(,%edx,8),%ebx
 804b37a:	89 45 8c             	mov    %eax,0xffffff8c(%ebp)
 804b37d:	8d 76 00             	lea    0x0(%esi),%esi
    {
      re = im = T(0.0);
    }
  BaseComplex(T rp, T ip = 0.0)
    {
 804b380:	d9 45 d8             	flds   0xffffffd8(%ebp)
      re = rp;
      im = ip;
    }
  BaseComplex(const BaseComplex<T> &c)
    {
 804b383:	89 f0                	mov    %esi,%eax
 804b385:	83 c6 08             	add    $0x8,%esi
    {
      re = im = T(0.0);
    }
  BaseComplex(T rp, T ip = 0.0)
    {
 804b388:	d9 45 dc             	flds   0xffffffdc(%ebp)
      re = rp;
      im = ip;
    }
  BaseComplex(const BaseComplex<T> &c)
    {
 804b38b:	8b 7d 0c             	mov    0xc(%ebp),%edi
 804b38e:	d9 c1                	fld    %st(1)
 804b390:	d9 c1                	fld    %st(1)
      return d[i - b[0] + n[0] * (j - b[1])];
    }

  Complex &index(Integer i, Integer j)
    {
 804b392:	8b 55 8c             	mov    0xffffff8c(%ebp),%edx
      re = rp;
      im = ip;
    }
  BaseComplex(const BaseComplex<T> &c)
    {
 804b395:	03 47 04             	add    0x4(%edi),%eax
      return d[i - b[0] + n[0] * (j - b[1])];
    }

  Complex &index(Integer i, Integer j)
    {
 804b398:	01 da                	add    %ebx,%edx
 804b39a:	83 c3 08             	add    $0x8,%ebx
 804b39d:	49                   	dec    %ecx
      im = ip;
    }
  BaseComplex(const BaseComplex<T> &c)
    {
      re = c.re;
 804b39e:	d9 40 f8             	flds   0xfffffff8(%eax)
 804b3a1:	dc ca                	fmul   %st,%st(2)
 804b3a3:	dc cb                	fmul   %st,%st(3)
 804b3a5:	d9 5d a8             	fstps  0xffffffa8(%ebp)
      im = c.im;
 804b3a8:	d9 40 fc             	flds   0xfffffffc(%eax)
 804b3ab:	dc c9                	fmul   %st,%st(1)
 804b3ad:	dc cc                	fmul   %st,%st(4)
 804b3af:	d9 5d ac             	fstps  0xffffffac(%ebp)
 804b3b2:	de e9                	fsubrp %st,%st(1)
 804b3b4:	d9 ca                	fxch   %st(2)
 804b3b6:	de c1                	faddp  %st,%st(1)
 804b3b8:	d9 c9                	fxch   %st(1)
 804b3ba:	d9 55 b8             	fsts   0xffffffb8(%ebp)
 804b3bd:	d9 c9                	fxch   %st(1)
 804b3bf:	d9 5d bc             	fstps  0xffffffbc(%ebp)
    }
  
  int operator==(const BaseComplex<T> &c) const
    {
      return (re == c.re && im == c.im);
    }
  int operator!=(const BaseComplex<T> &c) const
    {
      return !(*this == c);
    }

  BaseComplex<T> invert() const
    {
      T normalize = (re * re)+(im * im);
      return BaseComplex<T>((re / normalize), (-im / normalize));
    }
  BaseComplex<T> operator-() const
    {
      return BaseComplex<T>(-re, -im);
    }
  BaseComplex<T> conj() const
    {
      return BaseComplex<T>(re, -im);
    }
  int operator!() const
    {
      return ((re == 0.0) ? 1 : 0);
    }

  BaseComplex<T>& operator=(const BaseComplex<T> &c)
    {
      re = c.re;
      im = c.im;
      return *this;
    }
  void operator+=(const BaseComplex<T> &c)
    {
      re += c.re;
 804b3c2:	d9 42 f8             	flds   0xfffffff8(%edx)
 804b3c5:	de c1                	faddp  %st,%st(1)
 804b3c7:	d9 5a f8             	fstps  0xfffffff8(%edx)
      im += c.im;
 804b3ca:	d9 42 fc             	flds   0xfffffffc(%edx)
 804b3cd:	d8 45 bc             	fadds  0xffffffbc(%ebp)
 804b3d0:	d9 5a fc             	fstps  0xfffffffc(%edx)
 804b3d3:	75 ab                	jne    804b380 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x160>
 804b3d5:	ff 45 a0             	incl   0xffffffa0(%ebp)
 804b3d8:	8b 45 94             	mov    0xffffff94(%ebp),%eax
 804b3db:	ff 45 88             	incl   0xffffff88(%ebp)
 804b3de:	83 85 74 ff ff ff 08 	addl   $0x8,0xffffff74(%ebp)
 804b3e5:	39 45 a0             	cmp    %eax,0xffffffa0(%ebp)
 804b3e8:	0f 8e 02 ff ff ff    	jle    804b2f0 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xd0>
 804b3ee:	ff 45 a4             	incl   0xffffffa4(%ebp)
 804b3f1:	8b 55 98             	mov    0xffffff98(%ebp),%edx
 804b3f4:	ff 45 84             	incl   0xffffff84(%ebp)
 804b3f7:	ff 45 80             	incl   0xffffff80(%ebp)
 804b3fa:	39 55 a4             	cmp    %edx,0xffffffa4(%ebp)
 804b3fd:	0f 8e ad fe ff ff    	jle    804b2b0 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x90>
                t.index(i, j) += temp * a.index(i, k);
            }
        }
    }
}
 804b403:	8d 65 f4             	lea    0xfffffff4(%ebp),%esp
 804b406:	5b                   	pop    %ebx
 804b407:	5e                   	pop    %esi
 804b408:	5f                   	pop    %edi
 804b409:	5d                   	pop    %ebp
 804b40a:	c3                   	ret    


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]