This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Re: 3.0-pre vs 3.0 on Haney speed
- To: Andi Kleen <ak at suse dot de>
- Subject: Re: 3.0-pre vs 3.0 on Haney speed
- From: Paolo Carlini <pcarlini at unitus dot it>
- Date: Sun, 24 Jun 2001 23:37:41 +0200
- CC: gcc at gcc dot gnu dot org, jbuck at synopsys dot com, dnovillo at redhat dot com
- Organization: Universita' della Tuscia
- References: <200106232232.PAA29219@racerx.synopsys.com.suse.lists.egcs> <3B352498.50616F37@unitus.it.suse.lists.egcs> <oupk8223ryz.fsf@pigdrop.muc.suse.de>
- Reply-To: pcarlini at unitus dot it
Hi again,
thanks to some nice tips from Andi (and the encouragement of Joe) I think I have
made some progress in the analysis of the annoying slowdown which I reported
some days ago.
First, I verified that the problem is not with the loop unrolling optimization:
I ran the test again and obtained essentially the same behavior (modulo
statistical fluctuations, as usual):
---
-O2
---
gcc version 3.0
---------------
Timing Complex matrix operations for n = 200
Language Answer Check Time Iter Time
O-O C++ (-1.06982e+07,-2.82516e+08): 18.61 0.7444
Hand-coded C (-1.06982e+07,-2.82516e+08): 13.93 0.5572
gcc version 3.0 20010611 (prerelease)
-------------------------------------
Timing Complex matrix operations for n = 200
Language Answer Check Time Iter Time
O-O C++ (-1.06982e+07,-2.82516e+08): 15.2 0.608
Hand-coded C (-1.06982e+07,-2.82516e+08): 14.87 0.5948
Then, I learned that the problem is connected with the optimizations enabled by
-O2, and disappears when compiling with -O1; that is, 3.0pre and 3.0 perform
essentially in the same way:
---
-O1
---
gcc version 3.0
---------------
Timing Complex matrix operations for n = 200
Language Answer Check Time Iter Time
O-O C++ (-1.06982e+07,-2.82516e+08): 21.78 0.8712
Hand-coded C (-1.06982e+07,-2.82516e+08): 13.12 0.5248
gcc version 3.0 20010611 (prerelease)
-------------------------------------
Timing Complex matrix operations for n = 200
Language Answer Check Time Iter Time
O-O C++ (-1.06982e+07,-2.82516e+08): 21.03 0.8412
Hand-coded C (-1.06982e+07,-2.82516e+08): 12.36 0.4944
Therefore I proceeded to nail down the relevant source code section and its
assembly, using the -O2 switch for the builds.
The following is the function which is executed 25 times for the "O-O C++" row
of the Haney benchmark, always with the same data (two fixed complex valued
square matrices, 200x200 in my reported results):
void cmatMul(ComplexMatrix &t, const ComplexMatrix &a, const ComplexMatrix &b)
{
Integer i, j, k;
const Integer M = a.dim(1), N = b.dim(2), K = b.dim(1);
Complex temp;
const Complex zero = Complex(0.0);
memset(t, 0, M * N * sizeof(Complex));
for (j = 1; j <= N; j++)
{
for (k = 1; k <= K; k++)
{
temp = b.index(k, j);
if (temp != zero)
{
for (i = 1; i <= M; i++)
t.index(i, j) += temp * a.index(i, k);
}
}
}
}
This is a plain matrix multiplication; the only interesting points are the
use of memset to quickly zero the result matrix and the trick of checking for
zero entries in the first multiplier.
You will find attached to this message the two different assembly listings (for
3.0 and 3.0pre (20010611)) corresponding to it, as output by "objdump -S".
Anyway, the following are the two inner loops (over k and, inside, over i):
---
3.0
---
804b2c0: d9 45 c8 flds 0xffffffc8(%ebp)
804b2c3: 8b 7d 10 mov 0x10(%ebp),%edi
804b2c6: 8b 45 a4 mov 0xffffffa4(%ebp),%eax
804b2c9: 8b 57 08 mov 0x8(%edi),%edx
804b2cc: 48 dec %eax
804b2cd: 0f af c2 imul %edx,%eax
804b2d0: 8b 57 04 mov 0x4(%edi),%edx
804b2d3: 8d 04 30 lea (%eax,%esi,1),%eax
804b2d6: 8d 04 c2 lea (%edx,%eax,8),%eax
804b2d9: 31 d2 xor %edx,%edx
804b2db: d9 40 f8 flds 0xfffffff8(%eax)
804b2de: d9 55 b8 fsts 0xffffffb8(%ebp)
804b2e1: d9 40 fc flds 0xfffffffc(%eax)
804b2e4: d9 c9 fxch %st(1)
804b2e6: dd e2 fucom %st(2)
804b2e8: df e0 fnstsw %ax
804b2ea: dd da fstp %st(2)
804b2ec: d9 c9 fxch %st(1)
804b2ee: d9 5d d8 fstps 0xffffffd8(%ebp)
804b2f1: 9e sahf
804b2f2: d9 55 bc fsts 0xffffffbc(%ebp)
804b2f5: d9 55 dc fsts 0xffffffdc(%ebp)
804b2f8: 7a 17 jp 804b311
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf1>
804b2fa: 75 15 jne 804b311
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf1>
804b2fc: d9 45 cc flds 0xffffffcc(%ebp)
804b2ff: d9 c9 fxch %st(1)
804b301: da e9 fucompp
804b303: df e0 fnstsw %ax
804b305: 9e sahf
804b306: 7a 0b jp 804b313
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
804b308: 75 09 jne 804b313
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
804b30a: ba 01 00 00 00 mov $0x1,%edx
804b30f: eb 02 jmp 804b313
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
804b311: dd d8 fstp %st(0)
804b313: 85 d2 test %edx,%edx
804b315: 0f 85 94 00 00 00 jne 804b3af
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x18f>
804b31b: bb 01 00 00 00 mov $0x1,%ebx
804b320: 3b 5d a0 cmp 0xffffffa0(%ebp),%ebx
804b323: 0f 8f 86 00 00 00 jg 804b3af
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x18f>
804b329: 8b 55 8c mov 0xffffff8c(%ebp),%edx
804b32c: 8b 4d 90 mov 0xffffff90(%ebp),%ecx
804b32f: 89 55 94 mov %edx,0xffffff94(%ebp)
804b332: 89 4d 84 mov %ecx,0xffffff84(%ebp)
804b335: 8d 74 26 00 lea 0x0(%esi,1),%esi
804b339: 8d bc 27 00 00 00 00 lea 0x0(%edi,1),%edi
804b340: d9 45 d8 flds 0xffffffd8(%ebp)
804b343: 8b 7d 08 mov 0x8(%ebp),%edi
804b346: 8b 4d 94 mov 0xffffff94(%ebp),%ecx
804b349: d9 45 dc flds 0xffffffdc(%ebp)
804b34c: 8b 47 08 mov 0x8(%edi),%eax
804b34f: d9 c1 fld %st(1)
804b351: 8b 55 84 mov 0xffffff84(%ebp),%edx
804b354: d9 c1 fld %st(1)
804b356: 0f af c8 imul %eax,%ecx
804b359: 8b 47 04 mov 0x4(%edi),%eax
804b35c: 8d 0c 19 lea (%ecx,%ebx,1),%ecx
804b35f: 8d 0c c8 lea (%eax,%ecx,8),%ecx
804b362: 8b 45 0c mov 0xc(%ebp),%eax
804b365: 8b 78 08 mov 0x8(%eax),%edi
804b368: 8b 40 04 mov 0x4(%eax),%eax
804b36b: 0f af d7 imul %edi,%edx
804b36e: 8d 14 1a lea (%edx,%ebx,1),%edx
804b371: 43 inc %ebx
804b372: 8d 14 d0 lea (%eax,%edx,8),%edx
804b375: d9 42 f8 flds 0xfffffff8(%edx)
804b378: dc ca fmul %st,%st(2)
804b37a: dc cb fmul %st,%st(3)
804b37c: d9 5d a8 fstps 0xffffffa8(%ebp)
804b37f: d9 42 fc flds 0xfffffffc(%edx)
804b382: dc c9 fmul %st,%st(1)
804b384: dc cc fmul %st,%st(4)
804b386: d9 5d ac fstps 0xffffffac(%ebp)
804b389: de e9 fsubrp %st,%st(1)
804b38b: d9 ca fxch %st(2)
804b38d: de c1 faddp %st,%st(1)
804b38f: d9 c9 fxch %st(1)
804b391: d9 55 b8 fsts 0xffffffb8(%ebp)
804b394: d9 c9 fxch %st(1)
804b396: d9 5d bc fstps 0xffffffbc(%ebp)
804b399: d9 41 f8 flds 0xfffffff8(%ecx)
804b39c: de c1 faddp %st,%st(1)
804b39e: d9 59 f8 fstps 0xfffffff8(%ecx)
804b3a1: d9 41 fc flds 0xfffffffc(%ecx)
804b3a4: d8 45 bc fadds 0xffffffbc(%ebp)
804b3a7: d9 59 fc fstps 0xfffffffc(%ecx)
804b3aa: 3b 5d a0 cmp 0xffffffa0(%ebp),%ebx
804b3ad: 7e 91 jle 804b340
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x120>
804b3af: ff 45 90 incl 0xffffff90(%ebp)
804b3b2: 46 inc %esi
804b3b3: 3b 75 98 cmp 0xffffff98(%ebp),%esi
804b3b6: 0f 8e 04 ff ff ff jle 804b2c0
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xa0>
------
3.0pre
------
804b2f0: d9 45 c8 flds 0xffffffc8(%ebp)
804b2f3: 8b 7d 10 mov 0x10(%ebp),%edi
804b2f6: 8b 85 74 ff ff ff mov 0xffffff74(%ebp),%eax
804b2fc: 8b 57 04 mov 0x4(%edi),%edx
804b2ff: 01 d0 add %edx,%eax
804b301: 31 d2 xor %edx,%edx
804b303: d9 40 f8 flds 0xfffffff8(%eax)
804b306: d9 55 b8 fsts 0xffffffb8(%ebp)
804b309: d9 40 fc flds 0xfffffffc(%eax)
804b30c: d9 c9 fxch %st(1)
804b30e: dd e2 fucom %st(2)
804b310: df e0 fnstsw %ax
804b312: dd da fstp %st(2)
804b314: d9 c9 fxch %st(1)
804b316: d9 5d d8 fstps 0xffffffd8(%ebp)
804b319: 9e sahf
804b31a: d9 55 bc fsts 0xffffffbc(%ebp)
804b31d: d9 55 dc fsts 0xffffffdc(%ebp)
804b320: 7a 17 jp 804b339
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x119>
804b322: 75 15 jne 804b339
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x119>
804b324: d9 45 cc flds 0xffffffcc(%ebp)
804b327: d9 c9 fxch %st(1)
804b329: da e9 fucompp
804b32b: df e0 fnstsw %ax
804b32d: 9e sahf
804b32e: 7a 0b jp 804b33b
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
804b330: 75 09 jne 804b33b
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
804b332: ba 01 00 00 00 mov $0x1,%edx
804b337: eb 02 jmp 804b33b
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
804b339: dd d8 fstp %st(0)
804b33b: 85 d2 test %edx,%edx
804b33d: 0f 85 92 00 00 00 jne 804b3d5
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1b5>
804b343: 8b 7d 9c mov 0xffffff9c(%ebp),%edi
804b346: 85 ff test %edi,%edi
804b348: 0f 8e 87 00 00 00 jle 804b3d5
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1b5>
804b34e: 8b 7d 0c mov 0xc(%ebp),%edi
804b351: 8b 4d 08 mov 0x8(%ebp),%ecx
804b354: 8b 45 88 mov 0xffffff88(%ebp),%eax
804b357: 8b 5f 08 mov 0x8(%edi),%ebx
804b35a: 8b 71 08 mov 0x8(%ecx),%esi
804b35d: 8b 55 90 mov 0xffffff90(%ebp),%edx
804b360: 0f af c3 imul %ebx,%eax
804b363: 0f af d6 imul %esi,%edx
804b366: 8d 34 c5 08 00 00 00 lea 0x8(,%eax,8),%esi
804b36d: 8b 41 04 mov 0x4(%ecx),%eax
804b370: 8b 4d 9c mov 0xffffff9c(%ebp),%ecx
804b373: 8d 1c d5 08 00 00 00 lea 0x8(,%edx,8),%ebx
804b37a: 89 45 8c mov %eax,0xffffff8c(%ebp)
804b37d: 8d 76 00 lea 0x0(%esi),%esi
804b380: d9 45 d8 flds 0xffffffd8(%ebp)
804b383: 89 f0 mov %esi,%eax
804b385: 83 c6 08 add $0x8,%esi
804b388: d9 45 dc flds 0xffffffdc(%ebp)
804b38b: 8b 7d 0c mov 0xc(%ebp),%edi
804b38e: d9 c1 fld %st(1)
804b390: d9 c1 fld %st(1)
804b392: 8b 55 8c mov 0xffffff8c(%ebp),%edx
804b395: 03 47 04 add 0x4(%edi),%eax
804b398: 01 da add %ebx,%edx
804b39a: 83 c3 08 add $0x8,%ebx
804b39d: 49 dec %ecx
804b39e: d9 40 f8 flds 0xfffffff8(%eax)
804b3a1: dc ca fmul %st,%st(2)
804b3a3: dc cb fmul %st,%st(3)
804b3a5: d9 5d a8 fstps 0xffffffa8(%ebp)
804b3a8: d9 40 fc flds 0xfffffffc(%eax)
804b3ab: dc c9 fmul %st,%st(1)
804b3ad: dc cc fmul %st,%st(4)
804b3af: d9 5d ac fstps 0xffffffac(%ebp)
804b3b2: de e9 fsubrp %st,%st(1)
804b3b4: d9 ca fxch %st(2)
804b3b6: de c1 faddp %st,%st(1)
804b3b8: d9 c9 fxch %st(1)
804b3ba: d9 55 b8 fsts 0xffffffb8(%ebp)
804b3bd: d9 c9 fxch %st(1)
804b3bf: d9 5d bc fstps 0xffffffbc(%ebp)
804b3c2: d9 42 f8 flds 0xfffffff8(%edx)
804b3c5: de c1 faddp %st,%st(1)
804b3c7: d9 5a f8 fstps 0xfffffff8(%edx)
804b3ca: d9 42 fc flds 0xfffffffc(%edx)
804b3cd: d8 45 bc fadds 0xffffffbc(%ebp)
804b3d0: d9 5a fc fstps 0xfffffffc(%edx)
804b3d3: 75 ab jne 804b380
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0x160>
804b3d5: ff 45 a0 incl 0xffffffa0(%ebp)
804b3d8: 8b 45 94 mov 0xffffff94(%ebp),%eax
804b3db: ff 45 88 incl 0xffffff88(%ebp)
804b3de: 83 85 74 ff ff ff 08 addl $0x8,0xffffff74(%ebp)
804b3e5: 39 45 a0 cmp %eax,0xffffffa0(%ebp)
804b3e8: 0f 8e 02 ff ff ff jle 804b2f0
<_Z7cmatMulR13ComplexMatrixRKS_S2_+0xd0>
Indeed, in the innermost loop (that over i, for 3.0: 804b340-804b3ad, for
3.0pre: 804b380-804b3d3) there are clear differences, for instance, 8 mov for
3.0 vs only 3 mov (1 between regs) for 3.0pre !!
But at this point my recollections from my basic university course of x86
assembler are running out and I definitely need some more help from the list to
understand more of this disappointing behavior of the released 3.0.
A final observation: on my system current 3.1 snapshots perform on the test
similarly to 3.0, that is similarly to 2.95.x, that is worse :( than 3.0pre,
which consistently in the last three months showed C++ very close to Hand-coded
C...
Cheers,
Paolo.
// Complex
void cmatMul(ComplexMatrix &t, const ComplexMatrix &a, const ComplexMatrix &b)
{
804b220: 55 push %ebp
804b221: 89 e5 mov %esp,%ebp
804b223: 57 push %edi
const T &data() const { return *d; }
Boolean ownData() const { return myData; }
Integer dim(Integer i) const { return n[i - 1]; }
804b224: 8b 45 0c mov 0xc(%ebp),%eax
804b227: 56 push %esi
BaseArray<T> &operator=(const BaseArray<T> &a);
operator const T *() const { return d; }
operator T *() { return d; }
804b228: 8b 55 08 mov 0x8(%ebp),%edx
804b22b: 53 push %ebx
804b22c: 83 c4 80 add $0xffffff80,%esp
const T &data() const { return *d; }
Boolean ownData() const { return myData; }
Integer dim(Integer i) const { return n[i - 1]; }
804b22f: 8b 40 08 mov 0x8(%eax),%eax
804b232: 89 45 a0 mov %eax,0xffffffa0(%ebp)
804b235: 8b 45 10 mov 0x10(%ebp),%eax
804b238: 8b 40 0c mov 0xc(%eax),%eax
804b23b: 89 45 9c mov %eax,0xffffff9c(%ebp)
804b23e: 8b 45 10 mov 0x10(%ebp),%eax
804b241: 8b 5d 9c mov 0xffffff9c(%ebp),%ebx
804b244: 8b 40 08 mov 0x8(%eax),%eax
}
BaseComplex(T rp, T ip = 0.0)
{
re = rp;
im = ip;
804b247: c7 45 cc 00 00 00 00 movl $0x0,0xffffffcc(%ebp)
804b24e: c7 45 dc 00 00 00 00 movl $0x0,0xffffffdc(%ebp)
804b255: c7 45 d8 00 00 00 00 movl $0x0,0xffffffd8(%ebp)
const T &data() const { return *d; }
Boolean ownData() const { return myData; }
Integer dim(Integer i) const { return n[i - 1]; }
804b25c: 89 45 98 mov %eax,0xffffff98(%ebp)
804b25f: 8b 45 a0 mov 0xffffffa0(%ebp),%eax
re = im = T(0.0);
}
BaseComplex(T rp, T ip = 0.0)
{
re = rp;
804b262: c7 45 c8 00 00 00 00 movl $0x0,0xffffffc8(%ebp)
BaseArray<T> &operator=(const BaseArray<T> &a);
operator const T *() const { return d; }
operator T *() { return d; }
804b269: 0f af c3 imul %ebx,%eax
804b26c: c1 e0 03 shl $0x3,%eax
804b26f: 50 push %eax
804b270: 6a 00 push $0x0
804b272: 8b 4a 04 mov 0x4(%edx),%ecx
804b275: 51 push %ecx
804b276: e8 85 e5 ff ff call 8049800 <_init+0x268>
Integer i, j, k;
const Integer M = a.dim(1), N = b.dim(2), K = b.dim(1);
Complex temp;
const Complex zero = Complex(0.0);
memset(t, 0, M * N * sizeof(Complex));
for (j = 1; j <= N; j++)
804b27b: 8b 4d 9c mov 0xffffff9c(%ebp),%ecx
804b27e: 83 c4 10 add $0x10,%esp
804b281: c7 45 a4 01 00 00 00 movl $0x1,0xffffffa4(%ebp)
804b288: 39 4d a4 cmp %ecx,0xffffffa4(%ebp)
804b28b: 0f 8f 3d 01 00 00 jg 804b3ce <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1ae>
804b291: c7 45 8c 00 00 00 00 movl $0x0,0xffffff8c(%ebp)
804b298: 90 nop
804b299: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi
{
for (k = 1; k <= K; k++)
804b2a0: be 01 00 00 00 mov $0x1,%esi
804b2a5: 3b 75 98 cmp 0xffffff98(%ebp),%esi
804b2a8: 0f 8f 0e 01 00 00 jg 804b3bc <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x19c>
804b2ae: c7 45 90 00 00 00 00 movl $0x0,0xffffff90(%ebp)
804b2b5: 8d 74 26 00 lea 0x0(%esi,1),%esi
804b2b9: 8d bc 27 00 00 00 00 lea 0x0(%edi,1),%edi
im = c.im;
}
int operator==(const BaseComplex<T> &c) const
{
804b2c0: d9 45 c8 flds 0xffffffc8(%ebp)
804b2c3: 8b 7d 10 mov 0x10(%ebp),%edi
804b2c6: 8b 45 a4 mov 0xffffffa4(%ebp),%eax
804b2c9: 8b 57 08 mov 0x8(%edi),%edx
804b2cc: 48 dec %eax
804b2cd: 0f af c2 imul %edx,%eax
804b2d0: 8b 57 04 mov 0x4(%edi),%edx
804b2d3: 8d 04 30 lea (%eax,%esi,1),%eax
804b2d6: 8d 04 c2 lea (%edx,%eax,8),%eax
804b2d9: 31 d2 xor %edx,%edx
804b2db: d9 40 f8 flds 0xfffffff8(%eax)
804b2de: d9 55 b8 fsts 0xffffffb8(%ebp)
804b2e1: d9 40 fc flds 0xfffffffc(%eax)
804b2e4: d9 c9 fxch %st(1)
804b2e6: dd e2 fucom %st(2)
804b2e8: df e0 fnstsw %ax
804b2ea: dd da fstp %st(2)
804b2ec: d9 c9 fxch %st(1)
return (re == c.re && im == c.im);
}
int operator!=(const BaseComplex<T> &c) const
{
return !(*this == c);
}
BaseComplex<T> invert() const
{
T normalize = (re * re)+(im * im);
return BaseComplex<T>((re / normalize), (-im / normalize));
}
BaseComplex<T> operator-() const
{
return BaseComplex<T>(-re, -im);
}
BaseComplex<T> conj() const
{
return BaseComplex<T>(re, -im);
}
int operator!() const
{
return ((re == 0.0) ? 1 : 0);
}
BaseComplex<T>& operator=(const BaseComplex<T> &c)
{
re = c.re;
804b2ee: d9 5d d8 fstps 0xffffffd8(%ebp)
804b2f1: 9e sahf
804b2f2: d9 55 bc fsts 0xffffffbc(%ebp)
im = c.im;
804b2f5: d9 55 dc fsts 0xffffffdc(%ebp)
804b2f8: 7a 17 jp 804b311 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf1>
804b2fa: 75 15 jne 804b311 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf1>
804b2fc: d9 45 cc flds 0xffffffcc(%ebp)
804b2ff: d9 c9 fxch %st(1)
804b301: da e9 fucompp
804b303: df e0 fnstsw %ax
804b305: 9e sahf
804b306: 7a 0b jp 804b313 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
804b308: 75 09 jne 804b313 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
804b30a: ba 01 00 00 00 mov $0x1,%edx
804b30f: eb 02 jmp 804b313 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xf3>
804b311: dd d8 fstp %st(0)
804b313: 85 d2 test %edx,%edx
804b315: 0f 85 94 00 00 00 jne 804b3af <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x18f>
{
temp = b.index(k, j);
if (temp != zero)
{
for (i = 1; i <= M; i++)
804b31b: bb 01 00 00 00 mov $0x1,%ebx
804b320: 3b 5d a0 cmp 0xffffffa0(%ebp),%ebx
804b323: 0f 8f 86 00 00 00 jg 804b3af <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x18f>
804b329: 8b 55 8c mov 0xffffff8c(%ebp),%edx
804b32c: 8b 4d 90 mov 0xffffff90(%ebp),%ecx
804b32f: 89 55 94 mov %edx,0xffffff94(%ebp)
804b332: 89 4d 84 mov %ecx,0xffffff84(%ebp)
804b335: 8d 74 26 00 lea 0x0(%esi,1),%esi
804b339: 8d bc 27 00 00 00 00 lea 0x0(%edi,1),%edi
{
re = im = T(0.0);
}
BaseComplex(T rp, T ip = 0.0)
{
804b340: d9 45 d8 flds 0xffffffd8(%ebp)
return d[i - b[0] + n[0] * (j - b[1])];
}
Complex &index(Integer i, Integer j)
{
804b343: 8b 7d 08 mov 0x8(%ebp),%edi
804b346: 8b 4d 94 mov 0xffffff94(%ebp),%ecx
{
re = im = T(0.0);
}
BaseComplex(T rp, T ip = 0.0)
{
804b349: d9 45 dc flds 0xffffffdc(%ebp)
return d[i - b[0] + n[0] * (j - b[1])];
}
Complex &index(Integer i, Integer j)
{
804b34c: 8b 47 08 mov 0x8(%edi),%eax
{
re = im = T(0.0);
}
BaseComplex(T rp, T ip = 0.0)
{
804b34f: d9 c1 fld %st(1)
re = rp;
im = ip;
}
BaseComplex(const BaseComplex<T> &c)
{
804b351: 8b 55 84 mov 0xffffff84(%ebp),%edx
804b354: d9 c1 fld %st(1)
return d[i - b[0] + n[0] * (j - b[1])];
}
Complex &index(Integer i, Integer j)
{
804b356: 0f af c8 imul %eax,%ecx
804b359: 8b 47 04 mov 0x4(%edi),%eax
804b35c: 8d 0c 19 lea (%ecx,%ebx,1),%ecx
804b35f: 8d 0c c8 lea (%eax,%ecx,8),%ecx
re = rp;
im = ip;
}
BaseComplex(const BaseComplex<T> &c)
{
804b362: 8b 45 0c mov 0xc(%ebp),%eax
804b365: 8b 78 08 mov 0x8(%eax),%edi
804b368: 8b 40 04 mov 0x4(%eax),%eax
804b36b: 0f af d7 imul %edi,%edx
804b36e: 8d 14 1a lea (%edx,%ebx,1),%edx
804b371: 43 inc %ebx
re = rp;
im = ip;
}
BaseComplex(const BaseComplex<T> &c)
{
804b372: 8d 14 d0 lea (%eax,%edx,8),%edx
re = c.re;
804b375: d9 42 f8 flds 0xfffffff8(%edx)
804b378: dc ca fmul %st,%st(2)
804b37a: dc cb fmul %st,%st(3)
804b37c: d9 5d a8 fstps 0xffffffa8(%ebp)
im = c.im;
804b37f: d9 42 fc flds 0xfffffffc(%edx)
804b382: dc c9 fmul %st,%st(1)
804b384: dc cc fmul %st,%st(4)
804b386: d9 5d ac fstps 0xffffffac(%ebp)
804b389: de e9 fsubrp %st,%st(1)
804b38b: d9 ca fxch %st(2)
804b38d: de c1 faddp %st,%st(1)
804b38f: d9 c9 fxch %st(1)
804b391: d9 55 b8 fsts 0xffffffb8(%ebp)
804b394: d9 c9 fxch %st(1)
804b396: d9 5d bc fstps 0xffffffbc(%ebp)
}
int operator==(const BaseComplex<T> &c) const
{
return (re == c.re && im == c.im);
}
int operator!=(const BaseComplex<T> &c) const
{
return !(*this == c);
}
BaseComplex<T> invert() const
{
T normalize = (re * re)+(im * im);
return BaseComplex<T>((re / normalize), (-im / normalize));
}
BaseComplex<T> operator-() const
{
return BaseComplex<T>(-re, -im);
}
BaseComplex<T> conj() const
{
return BaseComplex<T>(re, -im);
}
int operator!() const
{
return ((re == 0.0) ? 1 : 0);
}
BaseComplex<T>& operator=(const BaseComplex<T> &c)
{
re = c.re;
im = c.im;
return *this;
}
void operator+=(const BaseComplex<T> &c)
{
re += c.re;
804b399: d9 41 f8 flds 0xfffffff8(%ecx)
804b39c: de c1 faddp %st,%st(1)
804b39e: d9 59 f8 fstps 0xfffffff8(%ecx)
im += c.im;
804b3a1: d9 41 fc flds 0xfffffffc(%ecx)
804b3a4: d8 45 bc fadds 0xffffffbc(%ebp)
804b3a7: d9 59 fc fstps 0xfffffffc(%ecx)
804b3aa: 3b 5d a0 cmp 0xffffffa0(%ebp),%ebx
804b3ad: 7e 91 jle 804b340 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x120>
804b3af: ff 45 90 incl 0xffffff90(%ebp)
804b3b2: 46 inc %esi
804b3b3: 3b 75 98 cmp 0xffffff98(%ebp),%esi
804b3b6: 0f 8e 04 ff ff ff jle 804b2c0 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xa0>
804b3bc: ff 45 a4 incl 0xffffffa4(%ebp)
804b3bf: 8b 55 9c mov 0xffffff9c(%ebp),%edx
804b3c2: ff 45 8c incl 0xffffff8c(%ebp)
804b3c5: 39 55 a4 cmp %edx,0xffffffa4(%ebp)
804b3c8: 0f 8e d2 fe ff ff jle 804b2a0 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x80>
t.index(i, j) += temp * a.index(i, k);
}
}
}
}
804b3ce: 8d 65 f4 lea 0xfffffff4(%ebp),%esp
804b3d1: 5b pop %ebx
804b3d2: 5e pop %esi
804b3d3: 5f pop %edi
804b3d4: 5d pop %ebp
804b3d5: c3 ret
// Complex
void cmatMul(ComplexMatrix &t, const ComplexMatrix &a, const ComplexMatrix &b)
{
804b220: 55 push %ebp
804b221: 89 e5 mov %esp,%ebp
804b223: 57 push %edi
804b224: 56 push %esi
804b225: 53 push %ebx
804b226: 81 ec 90 00 00 00 sub $0x90,%esp
BaseArray<T> &operator=(const BaseArray<T> &a);
operator const T *() const { return d; }
operator T *() { return d; }
804b22c: 8b 55 08 mov 0x8(%ebp),%edx
}
BaseComplex(T rp, T ip = 0.0)
{
re = rp;
im = ip;
804b22f: c7 45 cc 00 00 00 00 movl $0x0,0xffffffcc(%ebp)
const T &data() const { return *d; }
Boolean ownData() const { return myData; }
Integer dim(Integer i) const { return n[i - 1]; }
804b236: 8b 45 0c mov 0xc(%ebp),%eax
T im;
BaseComplex()
{
re = im = T(0.0);
804b239: c7 45 dc 00 00 00 00 movl $0x0,0xffffffdc(%ebp)
804b240: c7 45 d8 00 00 00 00 movl $0x0,0xffffffd8(%ebp)
const T &data() const { return *d; }
Boolean ownData() const { return myData; }
Integer dim(Integer i) const { return n[i - 1]; }
804b247: 8b 40 08 mov 0x8(%eax),%eax
re = im = T(0.0);
}
BaseComplex(T rp, T ip = 0.0)
{
re = rp;
804b24a: c7 45 c8 00 00 00 00 movl $0x0,0xffffffc8(%ebp)
const T &data() const { return *d; }
Boolean ownData() const { return myData; }
Integer dim(Integer i) const { return n[i - 1]; }
804b251: 89 45 9c mov %eax,0xffffff9c(%ebp)
804b254: 8b 45 10 mov 0x10(%ebp),%eax
804b257: 8b 40 0c mov 0xc(%eax),%eax
804b25a: 89 45 98 mov %eax,0xffffff98(%ebp)
804b25d: 8b 45 10 mov 0x10(%ebp),%eax
804b260: 8b 75 98 mov 0xffffff98(%ebp),%esi
804b263: 8b 40 08 mov 0x8(%eax),%eax
804b266: 89 45 94 mov %eax,0xffffff94(%ebp)
804b269: 8b 45 9c mov 0xffffff9c(%ebp),%eax
804b26c: 0f af c6 imul %esi,%eax
804b26f: c1 e0 03 shl $0x3,%eax
804b272: 50 push %eax
804b273: 6a 00 push $0x0
804b275: 8b 5a 04 mov 0x4(%edx),%ebx
804b278: 53 push %ebx
804b279: e8 82 e5 ff ff call 8049800 <_init+0x268>
Integer i, j, k;
const Integer M = a.dim(1), N = b.dim(2), K = b.dim(1);
Complex temp;
const Complex zero = Complex(0.0);
memset(t, 0, M * N * sizeof(Complex));
for (j = 1; j <= N; j++)
804b27e: 8b 4d 98 mov 0xffffff98(%ebp),%ecx
804b281: 83 c4 10 add $0x10,%esp
804b284: c7 45 a4 01 00 00 00 movl $0x1,0xffffffa4(%ebp)
804b28b: 39 4d a4 cmp %ecx,0xffffffa4(%ebp)
804b28e: 0f 8f 6f 01 00 00 jg 804b403 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1e3>
804b294: c7 45 84 00 00 00 00 movl $0x0,0xffffff84(%ebp)
804b29b: c7 45 80 00 00 00 00 movl $0x0,0xffffff80(%ebp)
804b2a2: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi
804b2a9: 8d bc 27 00 00 00 00 lea 0x0(%edi,1),%edi
{
for (k = 1; k <= K; k++)
804b2b0: c7 45 a0 01 00 00 00 movl $0x1,0xffffffa0(%ebp)
804b2b7: 8b 7d 94 mov 0xffffff94(%ebp),%edi
804b2ba: 39 7d a0 cmp %edi,0xffffffa0(%ebp)
804b2bd: 0f 8f 2b 01 00 00 jg 804b3ee <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1ce>
804b2c3: 8b 55 10 mov 0x10(%ebp),%edx
804b2c6: 8b 45 80 mov 0xffffff80(%ebp),%eax
804b2c9: 8b 4a 08 mov 0x8(%edx),%ecx
804b2cc: c7 45 88 00 00 00 00 movl $0x0,0xffffff88(%ebp)
804b2d3: 0f af c1 imul %ecx,%eax
804b2d6: 8b 4d 84 mov 0xffffff84(%ebp),%ecx
804b2d9: 89 4d 90 mov %ecx,0xffffff90(%ebp)
804b2dc: 8d 04 c5 08 00 00 00 lea 0x8(,%eax,8),%eax
804b2e3: 89 85 74 ff ff ff mov %eax,0xffffff74(%ebp)
804b2e9: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi
im = c.im;
}
int operator==(const BaseComplex<T> &c) const
{
804b2f0: d9 45 c8 flds 0xffffffc8(%ebp)
804b2f3: 8b 7d 10 mov 0x10(%ebp),%edi
804b2f6: 8b 85 74 ff ff ff mov 0xffffff74(%ebp),%eax
804b2fc: 8b 57 04 mov 0x4(%edi),%edx
804b2ff: 01 d0 add %edx,%eax
804b301: 31 d2 xor %edx,%edx
804b303: d9 40 f8 flds 0xfffffff8(%eax)
804b306: d9 55 b8 fsts 0xffffffb8(%ebp)
804b309: d9 40 fc flds 0xfffffffc(%eax)
804b30c: d9 c9 fxch %st(1)
804b30e: dd e2 fucom %st(2)
804b310: df e0 fnstsw %ax
804b312: dd da fstp %st(2)
804b314: d9 c9 fxch %st(1)
return (re == c.re && im == c.im);
}
int operator!=(const BaseComplex<T> &c) const
{
return !(*this == c);
}
BaseComplex<T> invert() const
{
T normalize = (re * re)+(im * im);
return BaseComplex<T>((re / normalize), (-im / normalize));
}
BaseComplex<T> operator-() const
{
return BaseComplex<T>(-re, -im);
}
BaseComplex<T> conj() const
{
return BaseComplex<T>(re, -im);
}
int operator!() const
{
return ((re == 0.0) ? 1 : 0);
}
BaseComplex<T>& operator=(const BaseComplex<T> &c)
{
re = c.re;
804b316: d9 5d d8 fstps 0xffffffd8(%ebp)
804b319: 9e sahf
804b31a: d9 55 bc fsts 0xffffffbc(%ebp)
im = c.im;
804b31d: d9 55 dc fsts 0xffffffdc(%ebp)
804b320: 7a 17 jp 804b339 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x119>
804b322: 75 15 jne 804b339 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x119>
804b324: d9 45 cc flds 0xffffffcc(%ebp)
804b327: d9 c9 fxch %st(1)
804b329: da e9 fucompp
804b32b: df e0 fnstsw %ax
804b32d: 9e sahf
804b32e: 7a 0b jp 804b33b <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
804b330: 75 09 jne 804b33b <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
804b332: ba 01 00 00 00 mov $0x1,%edx
804b337: eb 02 jmp 804b33b <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x11b>
804b339: dd d8 fstp %st(0)
804b33b: 85 d2 test %edx,%edx
804b33d: 0f 85 92 00 00 00 jne 804b3d5 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1b5>
{
temp = b.index(k, j);
if (temp != zero)
{
for (i = 1; i <= M; i++)
804b343: 8b 7d 9c mov 0xffffff9c(%ebp),%edi
804b346: 85 ff test %edi,%edi
804b348: 0f 8e 87 00 00 00 jle 804b3d5 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x1b5>
804b34e: 8b 7d 0c mov 0xc(%ebp),%edi
804b351: 8b 4d 08 mov 0x8(%ebp),%ecx
804b354: 8b 45 88 mov 0xffffff88(%ebp),%eax
804b357: 8b 5f 08 mov 0x8(%edi),%ebx
804b35a: 8b 71 08 mov 0x8(%ecx),%esi
804b35d: 8b 55 90 mov 0xffffff90(%ebp),%edx
804b360: 0f af c3 imul %ebx,%eax
804b363: 0f af d6 imul %esi,%edx
804b366: 8d 34 c5 08 00 00 00 lea 0x8(,%eax,8),%esi
804b36d: 8b 41 04 mov 0x4(%ecx),%eax
804b370: 8b 4d 9c mov 0xffffff9c(%ebp),%ecx
804b373: 8d 1c d5 08 00 00 00 lea 0x8(,%edx,8),%ebx
804b37a: 89 45 8c mov %eax,0xffffff8c(%ebp)
804b37d: 8d 76 00 lea 0x0(%esi),%esi
{
re = im = T(0.0);
}
BaseComplex(T rp, T ip = 0.0)
{
804b380: d9 45 d8 flds 0xffffffd8(%ebp)
re = rp;
im = ip;
}
BaseComplex(const BaseComplex<T> &c)
{
804b383: 89 f0 mov %esi,%eax
804b385: 83 c6 08 add $0x8,%esi
{
re = im = T(0.0);
}
BaseComplex(T rp, T ip = 0.0)
{
804b388: d9 45 dc flds 0xffffffdc(%ebp)
re = rp;
im = ip;
}
BaseComplex(const BaseComplex<T> &c)
{
804b38b: 8b 7d 0c mov 0xc(%ebp),%edi
804b38e: d9 c1 fld %st(1)
804b390: d9 c1 fld %st(1)
return d[i - b[0] + n[0] * (j - b[1])];
}
Complex &index(Integer i, Integer j)
{
804b392: 8b 55 8c mov 0xffffff8c(%ebp),%edx
re = rp;
im = ip;
}
BaseComplex(const BaseComplex<T> &c)
{
804b395: 03 47 04 add 0x4(%edi),%eax
return d[i - b[0] + n[0] * (j - b[1])];
}
Complex &index(Integer i, Integer j)
{
804b398: 01 da add %ebx,%edx
804b39a: 83 c3 08 add $0x8,%ebx
804b39d: 49 dec %ecx
im = ip;
}
BaseComplex(const BaseComplex<T> &c)
{
re = c.re;
804b39e: d9 40 f8 flds 0xfffffff8(%eax)
804b3a1: dc ca fmul %st,%st(2)
804b3a3: dc cb fmul %st,%st(3)
804b3a5: d9 5d a8 fstps 0xffffffa8(%ebp)
im = c.im;
804b3a8: d9 40 fc flds 0xfffffffc(%eax)
804b3ab: dc c9 fmul %st,%st(1)
804b3ad: dc cc fmul %st,%st(4)
804b3af: d9 5d ac fstps 0xffffffac(%ebp)
804b3b2: de e9 fsubrp %st,%st(1)
804b3b4: d9 ca fxch %st(2)
804b3b6: de c1 faddp %st,%st(1)
804b3b8: d9 c9 fxch %st(1)
804b3ba: d9 55 b8 fsts 0xffffffb8(%ebp)
804b3bd: d9 c9 fxch %st(1)
804b3bf: d9 5d bc fstps 0xffffffbc(%ebp)
}
int operator==(const BaseComplex<T> &c) const
{
return (re == c.re && im == c.im);
}
int operator!=(const BaseComplex<T> &c) const
{
return !(*this == c);
}
BaseComplex<T> invert() const
{
T normalize = (re * re)+(im * im);
return BaseComplex<T>((re / normalize), (-im / normalize));
}
BaseComplex<T> operator-() const
{
return BaseComplex<T>(-re, -im);
}
BaseComplex<T> conj() const
{
return BaseComplex<T>(re, -im);
}
int operator!() const
{
return ((re == 0.0) ? 1 : 0);
}
BaseComplex<T>& operator=(const BaseComplex<T> &c)
{
re = c.re;
im = c.im;
return *this;
}
void operator+=(const BaseComplex<T> &c)
{
re += c.re;
804b3c2: d9 42 f8 flds 0xfffffff8(%edx)
804b3c5: de c1 faddp %st,%st(1)
804b3c7: d9 5a f8 fstps 0xfffffff8(%edx)
im += c.im;
804b3ca: d9 42 fc flds 0xfffffffc(%edx)
804b3cd: d8 45 bc fadds 0xffffffbc(%ebp)
804b3d0: d9 5a fc fstps 0xfffffffc(%edx)
804b3d3: 75 ab jne 804b380 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x160>
804b3d5: ff 45 a0 incl 0xffffffa0(%ebp)
804b3d8: 8b 45 94 mov 0xffffff94(%ebp),%eax
804b3db: ff 45 88 incl 0xffffff88(%ebp)
804b3de: 83 85 74 ff ff ff 08 addl $0x8,0xffffff74(%ebp)
804b3e5: 39 45 a0 cmp %eax,0xffffffa0(%ebp)
804b3e8: 0f 8e 02 ff ff ff jle 804b2f0 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0xd0>
804b3ee: ff 45 a4 incl 0xffffffa4(%ebp)
804b3f1: 8b 55 98 mov 0xffffff98(%ebp),%edx
804b3f4: ff 45 84 incl 0xffffff84(%ebp)
804b3f7: ff 45 80 incl 0xffffff80(%ebp)
804b3fa: 39 55 a4 cmp %edx,0xffffffa4(%ebp)
804b3fd: 0f 8e ad fe ff ff jle 804b2b0 <_Z7cmatMulR13ComplexMatrixRKS_S2_+0x90>
t.index(i, j) += temp * a.index(i, k);
}
}
}
}
804b403: 8d 65 f4 lea 0xfffffff4(%ebp),%esp
804b406: 5b pop %ebx
804b407: 5e pop %esi
804b408: 5f pop %edi
804b409: 5d pop %ebp
804b40a: c3 ret