This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PING][PATCH/RFT] Fix PR middle-end/PR28690, modify swap_commutative_operands_p


On Sun, May 06, 2007 at 01:21:16PM -0700, H. J. Lu wrote:
> Here are what I got on Core 2 Duo with -O2 -ffast-math:
> 
> 2. SPEC CPU 2000 cores
[snip]
> 168.wupwise                      -6.56276%

Well, we found a Core 2 Duo box here.  We're not seeing as big a
degradation as you are (we measure -3.9%), but we seem to have narrowed
down one slowdown and created a C testcase that seems to mimic it.
Can you confirm that the testcase below runs slower for you too?
Also, we don't know the x86/x86_64 architecture that well, so can you
tell us why the code is running slower, so that I can attempt to modify
the patch so it doesn't?  Thanks.

I'll note that the slowdown is in the innermost loop; the difference
in disassembly is shown below.

Peter

(-O2 -ffast-math)

Base:
        # Inner loop of the listing labelled "Base" in this message
        # (x86-64, AT&T operand order: op src, dst).
        # Notation: [p] is the double loaded from address p; x = [r8+8] ^ xmm8.
        # Per iteration:
        #   %xmm6 += [rax]*[r8] - x*[rax+8]
        #   %xmm7 += x*[rax]    + [rax+8]*[r8]
        # %xmm8 is only read here; presumably it holds a sign-bit mask set up
        # before the loop, so the XOR negates [r8+8] (the conj() in the C
        # testcase) -- TODO confirm against the full function.
        .p2align 4,,7                   # align loop head to 16 bytes, but only if that needs <= 7 padding bytes
.L3:
        movsd   8(%r8), %xmm1           # xmm1 = [r8+8]
        addl    $1, %r9d                # bump the 32-bit loop counter
        movsd   (%rax), %xmm4           # xmm4 = [rax]
        xorpd   %xmm8, %xmm1            # xmm1 = x = [r8+8] ^ xmm8
        movsd   (%r8), %xmm5            # xmm5 = [r8]
        movapd  %xmm4, %xmm0            # xmm0 = copy of [rax] (mulsd overwrites its destination)
        movsd   8(%rax), %xmm3          # xmm3 = [rax+8]
        addq    $16, %r8                # advance %r8 by one 16-byte element
        movapd  %xmm1, %xmm2            # xmm2 = copy of x
        mulsd   %xmm5, %xmm0            # xmm0 = [rax]*[r8]
        mulsd   %xmm4, %xmm1            # xmm1 = x*[rax]
        addq    $16, %rax               # advance %rax by one 16-byte element
        cmpl    %r9d, %edi              # flags from %edi - %r9d; the SSE ops below leave EFLAGS alone
        mulsd   %xmm3, %xmm2            # xmm2 = x*[rax+8]
        mulsd   %xmm5, %xmm3            # xmm3 = [rax+8]*[r8]
        subsd   %xmm2, %xmm0            # xmm0 = [rax]*[r8] - x*[rax+8]
        addsd   %xmm3, %xmm1            # xmm1 = x*[rax] + [rax+8]*[r8]
        addsd   %xmm0, %xmm6            # accumulate first sum in %xmm6
        addsd   %xmm1, %xmm7            # accumulate second sum in %xmm7
        jg      .L3                     # loop while %edi > %r9d (signed)

Indexedload:
        # Inner loop of the listing labelled "Indexedload" in this message
        # (x86-64, AT&T operand order: op src, dst).
        # Notation: [p] is the double loaded from address p; x = [r8+8] ^ xmm8.
        # Per iteration:
        #   %xmm6 += [rax]*[r8] - [rax+8]*x
        #   %xmm7 += [rax]*x    + [rax+8]*[r8]
        # %xmm8 is only read here; presumably it holds a sign-bit mask set up
        # before the loop, so the XOR negates [r8+8] (the conj() in the C
        # testcase) -- TODO confirm against the full function.
        .p2align 4,,7                   # align loop head to 16 bytes, but only if that needs <= 7 padding bytes
.L3:
        movsd   8(%r8), %xmm3           # xmm3 = [r8+8]
        addl    $1, %r9d                # bump the 32-bit loop counter
        movsd   (%rax), %xmm4           # xmm4 = [rax]
        movsd   8(%rax), %xmm2          # xmm2 = [rax+8]
        addq    $16, %rax               # advance %rax by one 16-byte element
        xorpd   %xmm8, %xmm3            # xmm3 = x = [r8+8] ^ xmm8
        movapd  %xmm4, %xmm0            # xmm0 = copy of [rax] (mulsd overwrites its destination)
        movsd   (%r8), %xmm5            # xmm5 = [r8]
        addq    $16, %r8                # advance %r8 by one 16-byte element
        movapd  %xmm2, %xmm1            # xmm1 = copy of [rax+8]
        cmpl    %edi, %r9d              # flags from %r9d - %edi; the SSE ops below leave EFLAGS alone
        mulsd   %xmm5, %xmm0            # xmm0 = [rax]*[r8]
        mulsd   %xmm3, %xmm1            # xmm1 = [rax+8]*x
        mulsd   %xmm3, %xmm4            # xmm4 = [rax]*x
        mulsd   %xmm5, %xmm2            # xmm2 = [rax+8]*[r8]
        subsd   %xmm1, %xmm0            # xmm0 = [rax]*[r8] - [rax+8]*x
        addsd   %xmm4, %xmm2            # xmm2 = [rax]*x + [rax+8]*[r8]
        addsd   %xmm0, %xmm6            # accumulate first sum in %xmm6
        addsd   %xmm2, %xmm7            # accumulate second sum in %xmm7
        jl      .L3                     # loop while %r9d < %edi (signed)


linux% cat <<EOF > wupwise.c
extern double _Complex conj (double _Complex __z) __attribute__ ((__nothrow__));

#define DELAY 2000000
#define SIZE 1024

/* Reduced testcase kernel for the slowdown discussed in this message.
   Accumulates the sum over l in [0, k) of conj (a[l]) * b[l] and stores
   it in c[0].  The whole reduction is repeated DELAY times, continuing
   to add into the same accumulator, so that the inner loop runs long
   enough to be timed.

   k     number of elements read from each of a and b
   a, b  input vectors, at least k elements each
   c     output vector; only c[0] is written

   The parameters were spelled plain "_Complex", which GCC accepts as an
   extension meaning "double _Complex"; the type is now spelled out, so
   the generated code is unchanged.  */
void
zgemm (int k, double _Complex *a, double _Complex *b, double _Complex *c)
{
  int i, l;
  /* Was "(0.0,0.0)": a comma expression whose value is simply 0.0, not
     a complex literal.  */
  double _Complex temp = 0.0;

  for (i = 0; i < DELAY; i++)
    for (l = 0; l < k; l++)
      temp = temp + conj (a[l]) * b[l];
  c[0] = temp;
}

/* Benchmark driver: fills the two input vectors with fixed values and
   runs zgemm once over SIZE elements.  Always returns 0.  */
int
main (void)
{
  double _Complex a[SIZE];
  double _Complex b[SIZE];
  double _Complex c[SIZE];
  int i;

  /* The inputs were previously left uninitialized, so zgemm read
     indeterminate stack contents.  Any NaNs or denormals among them
     could distort the timing this testcase exists to measure.  Use
     fixed, normal values instead.  __real__/__imag__ are GCC
     extensions, in keeping with the rest of this GCC-only testcase.  */
  for (i = 0; i < SIZE; i++)
    {
      __real__ a[i] = 1.0;
      __imag__ a[i] = 0.5;
      __real__ b[i] = 0.5;
      __imag__ b[i] = -1.0;
    }

  /* zgemm writes only c[0]; the remaining elements of c are unused.  */
  zgemm (SIZE, a, b, c);
  return 0;
}
EOF


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]