This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: PATCH:[darwin] fix load of a misaligned double word


Dale Johannesen <dalej@apple.com> writes:

> On Jan 19, 2004, at 3:14 PM, Geoff Keating wrote:
> > 4. It's faster to copy a value using GPRs than FPRs in many cases.
> 
> Slower in other cases:
> 
> double a[100];
> main() {
>    int i;
>    for (i=0; i<100; i++)
>      a[i] = 3.0;
> }

I suspect that was not a very good example.  I used this program, with
Apple's gcc-1495:

#include <stdio.h>
#include <ppc_intrinsics.h>

/* Return a 64-bit timestamp assembled from two 32-bit reads.
   __mftbu and __mftb come from <ppc_intrinsics.h>; presumably they
   read the upper and lower halves of the PowerPC time base
   (NOTE(review): confirm against that header).  */
inline
unsigned long long ppc_intrinsic_time(void)
{ 
  unsigned long hi, lo;
  /* The two halves cannot be read in one step, so read the upper half
     again after the lower half and retry if it changed in between;
     this avoids pairing a stale upper half with a wrapped lower half.  */
  do
    { 
      hi = __mftbu();
      lo = __mftb();
    } while (hi != __mftbu());
  /* Widen before shifting so the upper half lands in bits 32..63.  */
  return ((unsigned long long)hi << 32) + lo;
}

double a[200];

/* Micro-benchmark: time two ways of storing the constant 3.0 into the
   first 100 elements of the global array a, and print the elapsed
   ppc_intrinsic_time() ticks for each.  */
main()
{ 
  int i;
  unsigned long long t1, t2, t3;

  /* pre-load into cache.  */
  for (i=0; i<100; i++)
    a[i] = 1.0;

  /* First timed region: plain C assignment, so the compiler is free to
     choose which register class carries the constant.  */
  t1 = ppc_intrinsic_time();
  for (i=0; i<100; i++)
    a[i] = 3.0;
  t2 = ppc_intrinsic_time();
  /* Second timed region: the same stores, but written as inline asm
     with an "f" input constraint so that each one is an stfd
     instruction (a store from a floating-point register).  */
  for (i=0; i<100; i++)
    asm ("stfd%U0%X0 %1,%0" : "=m" (a[i]) : "f"(3.0));
  t3 = ppc_intrinsic_time();

  /* time1 covers the plain assignment loop, time2 the stfd loop.  */
  printf ("time1: %llu\n", t2 - t1);
  printf ("time2: %llu\n", t3 - t2);
  return 0;
}

The assembly of the timed section is:

L7:
        mfspr r2, 269
        mfspr r11, 268
        mfspr r0, 269
        cmpw cr7,r2,r0
        bne cr7,L7
        rldicl r0,r2,0,32
        addis r2,r31,ha16(L_a$non_lazy_ptr-"L00000000001$pb")
        lwz r9,lo16(L_a$non_lazy_ptr-"L00000000001$pb")(r2)
        sldi r0,r0,32
        rldicl r2,r11,0,32
        lis r11,0x4008
        add r10,r0,r2
        sldi r11,r11,32
        addi r0,r9,792
L16:
        std r11,0(r9)
        addi r9,r9,8
        cmpw cr7,r9,r0
        ble++ cr7,L16
L17:
        mfspr r2, 269
        mfspr r11, 268
        mfspr r0, 269
        cmpw cr7,r2,r0
        bne cr7,L17
        rldicl r0,r2,0,32
        addis r2,r31,ha16(L_a$non_lazy_ptr-"L00000000001$pb")
        sldi r0,r0,32
        lwz r9,lo16(L_a$non_lazy_ptr-"L00000000001$pb")(r2)
        rldicl r2,r11,0,32
        addis r11,r31,ha16(LC2-"L00000000001$pb")
        add r0,r0,r2
        lfd f0,lo16(LC2-"L00000000001$pb")(r11)
        std r0,64(r1)
        addi r0,r9,792
L26:
        stfd f0,0(r9)
        addi r9,r9,8
        cmpw cr7,r9,r0
        ble++ cr7,L26
L27:
        mfspr r9, 269
        mfspr r2, 268
        mfspr r0, 269

I ran 4 runs, and got:

time1: 16
time2: 158
time1: 12
time2: 146
time1: 22
time2: 131
time1: 21
time2: 139

So, roughly 6-12x slower for FPRs in these four runs.

-- 
- Geoffrey Keating <geoffk@geoffk.org>


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]