This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
code generation puzzle on alpha
- To: gcc at gcc dot gnu dot org
- Subject: code generation puzzle on alpha
- From: Brad Lucier <lucier at math dot purdue dot edu>
- Date: Wed, 29 Sep 1999 14:12:53 -0500 (EST)
- Cc: lucier at math dot purdue dot edu, hosking at cs dot purdue dot edu, staff at math dot purdue dot edu
Because I have way too much time on my hands ;-), I intend to investigate
why gcc version 2.96 19990922 (experimental) on the alpha generates (1)
a float-to-int conversion instruction, when the result is never used, and
(2) several float-move instructions that are totally unnecessary, since
there are more than enough floating-point registers available to execute
this code without moves. This has been extracted from a larger file;
this routine is part of the inner loop of a molecular dynamics code I'm
writing (the C code is automatically generated by a Scheme->C compiler),
and it is worthwhile to make this run as fast as possible.
Since I am a rank amateur at reading rtl dumps, I invite anyone else who
may have an interest in improving this part of the compiler to jump in;
even if I start weeks ahead of you, you'll probably finish first.
Brad Lucier lucier@math.purdue.edu
The compiler options:
gcc -mcpu=ev6 -mieee -fno-math-errno -fPIC -O2 -S -D___DYNAMIC -D___SINGLE_HOST -save-temps -da crap.c
The C code:
/* Prototypes for the square-root routines. Of the two, only sqrt() is
   called by the routine in this test case. */
extern double sqrt (double __x) ;
extern double __sqrt (double __x) ;
/* File-scope base value. The routine in this test case adds
   (1) * 4 * (1 << 3) == 32 to it to initialise its local ___start, which is
   not referenced again. */
static long ___lp;
/* Jump-buffer type declarations.
   NOTE(review): the double-underscore names suggest these came from system
   headers via preprocessing -- confirm.  In this test case they are needed
   only so that the `catcher' member of the processor state has a complete
   pointed-to type; no jump buffer is read or written. */
/* Array of 17 long ints. */
typedef long int __jmp_buf[17];
/* Signal-set representation: enough unsigned longs to hold 1024 bits. */
typedef struct
{
unsigned long int __val[(1024 / (8 * sizeof (unsigned long int))) ];
} __sigset_t;
/* A jmp_buf is a one-element array of this record, so a jmp_buf object
   decays to a pointer when passed to a function. */
typedef struct __jmp_buf_tag
{
__jmp_buf __jmpbuf;
int __mask_was_saved;
__sigset_t __saved_mask;
} jmp_buf[1];
/* Wrapper struct holding a single jmp_buf. */
typedef struct ___jmpbuf_struct
{
jmp_buf buf;
} ___jmpbuf_struct;
/* Processor-state record passed (by pointer, as ___processor_state) to the
   generated routine.  The routine in this test case touches only r[] and pc;
   every other member is present just to reproduce the record's layout.
   With 8-byte pointers and longs, the nine leading pointers occupy bytes
   0..71, so r[i] is at byte offset 72 + 8*i and pc is at byte offset 232. */
typedef struct ___processor_state_struct
{
long *stack, *stack_base, *stack_limit, *stack_trip, *stack_break, *fp;
long *heap, *heap_limit, *hp;
/* r[0..3] and r[5..7] are read into locals by the routine, and r[1..3] and
   r[5..7] are written back; pc is read, written back and returned. */
long r[20 ], pc, temp1, temp2, temp3, temp4;
int na, np;
int intr_enabled, intr_flag[3 ];
long glo_list_head, glo_list_tail;
long handler_break, handler_stack_limit, handler_heap_limit,
handler_not_proc, handler_not_proc_glo,
handler_wrong_nargs, handler_get_rest,
handler_get_key, handler_get_key_rest,
handler_force, handler_clam_conv_error,
handler_cdef_conv_error, handler_return_to_c,
initial_continuation;
long executable_wills, non_executable_wills;
___jmpbuf_struct *catcher;
} ___processor_state_struct, *___processor_state;
/* Record for one global: three longs.  The routine in this test case reads
   only the `val' member. */
typedef struct ___glo_struct
{
long val, prm, next;
} ___glo_struct;
/* Table of four pointers.  The routine casts entries [2] and [3] to
   ___glo_struct * and reads a double through each entry's `val'.  Nothing in
   this file assigns to the table. */
static long * ___glo_tbl[4 ];
/* Machine-generated routine (per the message, output of a Scheme->C
   compiler) used as the code-generation test case.

   Parameter:
     ___ps  processor state; r[0..3], r[5..7] and pc are read on entry, and
            pc, r[1..3], r[5..7] are written back on exit.
   Returns:
     ___pc, i.e. the value read from ___ps->pc on entry; it is never modified.

   Address idiom used throughout:
     ((long) (((long *) (p - 1)) + 1)) + (((long) (k) << 2) << (3 - 2))
   evaluates to (p - 1 + sizeof (long)) + 8*k, i.e. p + 7 + 8*k when long is
   8 bytes.  Below, "slot k of p" means the 8-byte word at that address.

   Net effect on memory (A = slot 2 of the incoming ___r1 object,
   B = slot 2 of the incoming ___r2 object, d = component-wise difference of
   the doubles reached through slot 0 of each object):
     f = (g3 * a * b) / (g2 * |d|^2 * |d|)
     A[i] -= f * d[i];   B[i] += f * d[i];   for i = 0, 1, 2
   where a and b are slot 1 of each object's slot-4 vector and g2, g3 are the
   doubles reached through ___glo_tbl[2] and ___glo_tbl[3].
   NOTE(review): this has the shape of an inverse-square pairwise interaction
   applied with opposite signs to two accumulators, consistent with the
   message's "molecular dynamics" description -- confirm against the Scheme
   source.

   ___temp, ___temp64 and ___start are never referenced after declaration,
   and ___r0 is loaded but never used or written back. */
static long ___H__20_test_2d_alpha2(___processor_state ___ps)
{
register long ___pc,
___temp;
register unsigned long ___temp64;
register long ___start = ___lp + ((1) * 4 * (1 << 3));
register long ___r0;
register long ___r1;
register long ___r2;
register long ___r3;
register long ___r5;
register long ___r6;
register long ___r7;
double ___F64R1;
double ___F64R3;
double ___F64R4;
double ___F64R5;
double ___F64R6;
double ___F64R7;
double ___F64R8;
double ___F64R9;
double ___F64R10;
/* Copy the state's registers and pc into locals. */
___r0 = ___ps->r[0];
___r1 = ___ps->r[1];
___r2 = ___ps->r[2];
___r3 = ___ps->r[3];
___r5 = ___ps->r[5];
___r6 = ___ps->r[6];
___r7 = ___ps->r[7];
___pc = ___ps->pc;
/* ___r3 = slot 2 of ___r1 (accumulator A); prefetch A[0]. */
___r3 = (*(long *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2))));
___F64R4 = *(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2)));
/* ___r5 / ___r6 = slot 4 of ___r1 / ___r2; read the double in slot 1 of each. */
___r5 = (*(long *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (4L) << 2)) << (3 - 2))));
___F64R5 = *(double *) (((long) (((long *) (___r5 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
___r6 = (*(long *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (4L) << 2)) << (3 - 2))));
___F64R6 = *(double *) (((long) (((long *) (___r6 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
/* Numerator: (double reached through ___glo_tbl[3]) * a * b. */
___F64R5 = (((*(double *) ((((long *) ((*(___glo_struct *) ___glo_tbl[3]).val - 1)) + 1))) * ___F64R5) * ___F64R6);
/* ___r1 is overwritten with its own slot 0; ___r7 = slot 0 of ___r2.
   These are the two three-component vectors being differenced. */
___r1 = (*(long *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2))));
___F64R6 = *(double *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2)));
___r7 = (*(long *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2))));
___F64R8 = *(double *) (((long) (((long *) (___r7 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2)));
/* d[0] and its square. */
___F64R6 = (___F64R6 - ___F64R8);
___F64R8 = (___F64R6 * ___F64R6);
/* d[1] and its square. */
___F64R9 = *(double *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
___F64R10 = *(double *) (((long) (((long *) (___r7 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
___F64R9 = (___F64R9 - ___F64R10);
___F64R10 = (___F64R9 * ___F64R9);
/* d[2] and its square. */
___F64R1 = *(double *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2)));
___F64R7 = *(double *) (((long) (((long *) (___r7 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2)));
___F64R1 = (___F64R1 - ___F64R7);
___F64R7 = (___F64R1 * ___F64R1);
/* Squared length of d, then its square root. */
___F64R7 = ((___F64R8 + ___F64R10) + ___F64R7);
___F64R8 = sqrt(___F64R7);
/* Denominator: (double reached through ___glo_tbl[2]) * |d|^2 * |d|. */
___F64R7 = (((*(double *) ((((long *) ((*(___glo_struct *) ___glo_tbl[2]).val - 1)) + 1))) * ___F64R7) * ___F64R8);
/* Common scale factor f. */
___F64R5 = (___F64R5 / ___F64R7);
/* A[0] -= f * d[0]; the product stays in ___F64R6 for reuse on B. */
___F64R6 = (___F64R5 * ___F64R6);
___F64R4 = (___F64R4 - ___F64R6);
*(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2))) = ___F64R4;
/* A[1] -= f * d[1]; the product stays in ___F64R7. */
___F64R4 = *(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
___F64R7 = (___F64R5 * ___F64R9);
___F64R4 = (___F64R4 - ___F64R7);
*(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2))) = ___F64R4;
/* A[2] -= f * d[2]; the product stays in ___F64R1. */
___F64R4 = *(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2)));
___F64R1 = (___F64R5 * ___F64R1);
___F64R4 = (___F64R4 - ___F64R1);
*(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2))) = ___F64R4;
/* ___r2 is overwritten with its own slot 2 (accumulator B);
   B[i] += the same three products. */
___r2 = (*(long *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2))));
___F64R3 = *(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2)));
___F64R3 = (___F64R3 + ___F64R6);
*(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2))) = ___F64R3;
___F64R3 = *(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
___F64R3 = (___F64R3 + ___F64R7);
*(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2))) = ___F64R3;
___F64R3 = *(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2)));
___F64R1 = (___F64R3 + ___F64R1);
*(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2))) = ___F64R1;
/* Result register ___r1 receives accumulator B's address value. */
___r1 = (___r2);
/* Write the locals back to the processor state.  r[0] is not written. */
___ps->pc = ___pc;
___ps->r[1] = ___r1;
___ps->r[2] = ___r2;
___ps->r[3] = ___r3;
___ps->r[5] = ___r5;
___ps->r[6] = ___r6;
___ps->r[7] = ___r7;
return ___pc;
}
The assembler:
# Assembly listing for ___H__20_test_2d_alpha2 as posted in the message.
# NOTE(review): the register roles described in these comments are inferred by
# matching displacements against the C source, assuming 8-byte longs and
# pointers and the usual Alpha calling convention ($16 = first argument,
# $0 = integer return value, $26 = return address) -- confirm.
.file 1 "crap.c"
.set noat
.set noreorder
.arch ev6
.text
.align 5
.ent ___H__20_test_2d_alpha2
___H__20_test_2d_alpha2:
.eflag 48
.frame $30,0,$26,0
ldgp $29,0($27)
$___H__20_test_2d_alpha2..ng:
.prologue 1
# Displacements 80 and 88 from $16 match ___ps->r[1] and ___ps->r[2];
# displacement 232 matches ___ps->pc, which is loaded into a floating-point
# register here.
lda $3,___glo_tbl
ldq $1,80($16)
ldq $2,88($16)
ldt $f15,232($16)
# 24($3) and 16($3) are ___glo_tbl[3] and ___glo_tbl[2].
ldq $5,24($3)
# Displacements of the form 7 + 8*k are "slot k" in the C source's
# address idiom (39 = slot 4, 23 = slot 2, 15 = slot 1, 7 = slot 0).
ldq $8,39($1)
ldq $22,39($2)
ldq $6,7($2)
ldq $7,16($3)
ldq $3,23($1)
ldq $4,0($5)
ldt $f24,15($8)
ldt $f25,15($22)
ldq $1,7($1)
ldt $f27,7($6)
ldt $f26,15($6)
ldt $f11,7($4)
ldq $4,0($7)
ldt $f13,15($1)
ldt $f12,23($1)
ldt $f23,23($6)
ldt $f22,7($3)
ldt $f14,7($4)
ldq $2,23($2)
multsu $f11,$f24,$f10
# ftoit copies the value loaded from 232($16) into integer register $0.
# NOTE(review): $0 is normally the return-value register and the C routine
# returns ___pc, so this result may in fact be the function's return value
# rather than unused -- confirm.
ftoit $f15,$0
subtsu $f13,$f26,$f11
multsu $f10,$f25,$f24
ldt $f25,7($1)
# fmov is a register-to-register floating-point copy; the fmov instructions
# in this listing are the moves the message describes as unnecessary.
fmov $f11,$f13
subtsu $f12,$f23,$f11
subtsu $f25,$f27,$f10
multsu $f13,$f13,$f26
fmov $f11,$f12
fmov $f10,$f25
multsu $f12,$f12,$f23
multsu $f25,$f25,$f27
addtsu $f27,$f26,$f10
addtsu $f23,$f10,$f11
fmov $f11,$f23
# The square root is an inline instruction; this listing contains no call
# to sqrt().
sqrttsu $f23,$f11
multsu $f14,$f23,$f10
multsu $f10,$f11,$f23
divtsu $f24,$f23,$f10
fmov $f10,$f24
multsu $f25,$f24,$f11
multsu $f24,$f13,$f23
multsu $f12,$f24,$f10
fmov $f11,$f25
fmov $f10,$f12
# Three subtract-and-store updates through $3 (slots 0, 1, 2).
subtsu $f22,$f25,$f11
fmov $f11,$f22
stt $f22,7($3)
ldt $f22,15($3)
subtsu $f22,$f23,$f10
fmov $f10,$f22
stt $f22,15($3)
ldt $f22,23($3)
subtsu $f22,$f12,$f11
fmov $f11,$f22
stt $f22,23($3)
# Three add-and-store updates through $2 (slots 0, 1, 2).
ldt $f10,7($2)
addtsu $f10,$f25,$f11
fmov $f11,$f10
stt $f10,7($2)
ldt $f10,15($2)
addtsu $f10,$f23,$f11
fmov $f11,$f10
stt $f10,15($2)
ldt $f10,23($2)
addtsu $f12,$f10,$f11
fmov $f11,$f12
stt $f12,23($2)
# Write-back to the processor state: displacements 128, 88, 96, 112, 120, 80
# match r[7], r[2], r[3], r[5], r[6], r[1].  No store to displacement 232
# (pc) appears in this listing.
stq $6,128($16)
stq $2,88($16)
stq $3,96($16)
stq $8,112($16)
stq $22,120($16)
stq $2,80($16)
ret $31,($26),1
.end ___H__20_test_2d_alpha2
# Storage for ___lp: 8 zero bytes in .sbss.
.section .sbss,"aw"
.type ___lp,@object
.size ___lp,8
.align 3
___lp:
.zero 8
# Storage for ___glo_tbl: 32 zero bytes (four 8-byte entries) in .bss.
.section .bss
.type ___glo_tbl,@object
.size ___glo_tbl,32
.align 3
___glo_tbl:
.zero 32
.ident "GCC: (GNU) 2.96 19990922 (experimental)"