This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: PATCH:[darwin] fix load of a misaligned double word
Dale Johannesen <dalej@apple.com> writes:
> On Jan 19, 2004, at 3:14 PM, Geoff Keating wrote:
> > 4. It's faster to copy a value using GPRs than FPRs in many cases.
>
> Slower in other cases:
>
> double a[100];
> main() {
> int i;
> for (i=0; i<100; i++)
> a[i] = 3.0;
> }
I suspect that was not a very good example. I used this program, with
Apple's gcc-1495:
#include <stdio.h>
#include <ppc_intrinsics.h>
/* Return a 64-bit count assembled from the two 32-bit values delivered by
   the __mftbu() and __mftb() intrinsics from <ppc_intrinsics.h>
   (presumably the upper and lower halves of the PowerPC time base, going
   by the intrinsic names).  */
inline
unsigned long long ppc_intrinsic_time(void)
{
unsigned long hi, lo;
/* The halves are read one at a time, so read the upper half again after
   the lower one and retry if it changed; this keeps hi and lo from being
   a mismatched pair.  */
do
{
hi = __mftbu();
lo = __mftb();
} while (hi != __mftbu());
/* Widen hi before shifting so the shift by 32 is done in 64-bit
   arithmetic, then combine it with the lower half.  */
return ((unsigned long long)hi << 32) + lo;
}
double a[200];
/* Micro-benchmark: store the double constant 3.0 into the first 100
   elements of a[] twice -- once with a plain C assignment and once with an
   explicit stfd issued through inline asm -- and print the elapsed count
   reported by ppc_intrinsic_time() for each loop.  */
main()
{
int i;
unsigned long long t1, t2, t3;
/* pre-load into cache. */
for (i=0; i<100; i++)
a[i] = 1.0;
t1 = ppc_intrinsic_time();
/* Variant 1 (reported as "time1"): ordinary assignment, so the compiler
   is free to choose how the constant is materialised and stored.  */
for (i=0; i<100; i++)
a[i] = 3.0;
t2 = ppc_intrinsic_time();
/* Variant 2 (reported as "time2"): the "f" input constraint asks for 3.0
   in a floating-point register and the template stores it with stfd; the
   "=m" output makes a[i] the memory destination.  The %U0%X0 modifiers
   presumably let GCC add the update/indexed suffix that matches the
   addressing mode chosen for operand 0 -- see the rs6000 back end.  */
for (i=0; i<100; i++)
asm ("stfd%U0%X0 %1,%0" : "=m" (a[i]) : "f"(3.0));
t3 = ppc_intrinsic_time();
/* Both results are differences of ppc_intrinsic_time() values, i.e. they
   are in whatever unit that counter ticks in, not in seconds.  */
printf ("time1: %llu\n", t2 - t1);
printf ("time2: %llu\n", t3 - t2);
return 0;
}
The assembly of the timed section is:
L7:
mfspr r2, 269
mfspr r11, 268
mfspr r0, 269
cmpw cr7,r2,r0
bne cr7,L7
rldicl r0,r2,0,32
addis r2,r31,ha16(L_a$non_lazy_ptr-"L00000000001$pb")
lwz r9,lo16(L_a$non_lazy_ptr-"L00000000001$pb")(r2)
sldi r0,r0,32
rldicl r2,r11,0,32
lis r11,0x4008
add r10,r0,r2
sldi r11,r11,32
addi r0,r9,792
L16:
std r11,0(r9)
addi r9,r9,8
cmpw cr7,r9,r0
ble++ cr7,L16
L17:
mfspr r2, 269
mfspr r11, 268
mfspr r0, 269
cmpw cr7,r2,r0
bne cr7,L17
rldicl r0,r2,0,32
addis r2,r31,ha16(L_a$non_lazy_ptr-"L00000000001$pb")
sldi r0,r0,32
lwz r9,lo16(L_a$non_lazy_ptr-"L00000000001$pb")(r2)
rldicl r2,r11,0,32
addis r11,r31,ha16(LC2-"L00000000001$pb")
add r0,r0,r2
lfd f0,lo16(LC2-"L00000000001$pb")(r11)
std r0,64(r1)
addi r0,r9,792
L26:
stfd f0,0(r9)
addi r9,r9,8
cmpw cr7,r9,r0
ble++ cr7,L26
L27:
mfspr r9, 269
mfspr r2, 268
mfspr r0, 269
I did four runs, and got:
time1: 16
time2: 158
time1: 12
time2: 146
time1: 22
time2: 131
time1: 21
time2: 139
So, 5-10x slower for FPRs.
--
- Geoffrey Keating <geoffk@geoffk.org>