This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

code generation puzzle on alpha


Because I have way too much time on my hands ;-), I intend to investigate
why gcc version 2.96 19990922 (experimental) on the Alpha generates (1)
a float-to-int conversion instruction, when the result is never used, and
(2) several float-move instructions that are totally unnecessary, since
there are more than enough floating-point registers available to execute
this code without moves.  This has been extracted from a larger file;
this routine is part of the inner loop of a molecular dynamics code I'm
writing (the C code is automatically generated by a Scheme->C compiler),
and it is worthwhile to make this run as fast as possible.

Since I am a rank amateur at reading RTL dumps, I invite anyone else who
may have an interest in improving this part of the compiler to jump in;
even if I start weeks ahead of you, you'll probably finish first.

Brad Lucier     lucier@math.purdue.edu

The compiler options:

gcc -mcpu=ev6 -mieee -fno-math-errno -fPIC -O2 -S -D___DYNAMIC -D___SINGLE_HOST -save-temps -da crap.c


The C code:

/* Prototypes for two square-root routines, each taking and returning double. */
extern double sqrt(double __x);
extern double __sqrt(double __x);

/* File-local long with static storage (so it starts out as zero). */
static long ___lp;

/* Array type of 17 longs. */
typedef long __jmp_buf[17];

/*
 * Array of 1024 / (8 * sizeof(unsigned long)) unsigned long words,
 * i.e. 1024 bits when a byte has 8 bits.
 */
typedef struct {
    unsigned long __val[1024 / (8 * sizeof(unsigned long))];
} __sigset_t;

/*
 * struct __jmp_buf_tag bundles a __jmp_buf, an int flag and a __sigset_t;
 * jmp_buf is a one-element array of that struct.
 */
typedef struct __jmp_buf_tag {
    __jmp_buf __jmpbuf;
    int __mask_was_saved;
    __sigset_t __saved_mask;
} jmp_buf[1];

/* Wrapper struct whose only member is a jmp_buf. */
typedef struct ___jmpbuf_struct {
    jmp_buf buf;
} ___jmpbuf_struct;

/*
 * State record passed to the generated routine.  Members are listed one per
 * line in their original order, so the layout is unchanged.
 * ___processor_state is a pointer to this struct.
 */
typedef struct ___processor_state_struct {
    long *stack;
    long *stack_base;
    long *stack_limit;
    long *stack_trip;
    long *stack_break;
    long *fp;

    long *heap;
    long *heap_limit;
    long *hp;

    long r[20];
    long pc;
    long temp1;
    long temp2;
    long temp3;
    long temp4;

    int na;
    int np;

    int intr_enabled;
    int intr_flag[3];

    long glo_list_head;
    long glo_list_tail;

    long handler_break;
    long handler_stack_limit;
    long handler_heap_limit;
    long handler_not_proc;
    long handler_not_proc_glo;
    long handler_wrong_nargs;
    long handler_get_rest;
    long handler_get_key;
    long handler_get_key_rest;
    long handler_force;
    long handler_clam_conv_error;
    long handler_cdef_conv_error;
    long handler_return_to_c;
    long initial_continuation;

    long executable_wills;
    long non_executable_wills;

    ___jmpbuf_struct *catcher;
} ___processor_state_struct, *___processor_state;

/* Record of three longs: val, prm, next (in that order). */
typedef struct ___glo_struct {
    long val;
    long prm;
    long next;
} ___glo_struct;

/* File-local table of four long pointers (zero-initialized static storage). */
static long *___glo_tbl[4];

/*
 * Machine-generated routine (the surrounding message says the C comes from a
 * Scheme->C compiler).  It copies r[0..3], r[5..7] and pc out of *___ps into
 * locals, runs one straight-line floating-point computation through raw
 * address arithmetic, writes r[1], r[2], r[3], r[5], r[6], r[7] and pc back
 * into *___ps, and returns ___pc (which is never modified in between).
 *
 * Every object access below uses the same address form
 *     ((long) (((long *) (X - 1)) + 1)) + ((k << 2) << (3 - 2))
 * which is the byte address (X - 1) + sizeof(long) + 8*k.  The comments call
 * this "slot k of X".
 */
static long ___H__20_test_2d_alpha2(___processor_state ___ps)
{
    /* ___temp, ___temp64 and ___start are never read in this routine;
       ___r0 is loaded below but neither used nor written back. */
    register long ___pc,
         ___temp;
    register unsigned long ___temp64;
    register long ___start = ___lp + ((1) * 4 * (1 << 3));
    register long ___r0;
    register long ___r1;
    register long ___r2;
    register long ___r3;
    register long ___r5;
    register long ___r6;
    register long ___r7;
    double ___F64R1;
    double ___F64R3;
    double ___F64R4;
    double ___F64R5;
    double ___F64R6;
    double ___F64R7;
    double ___F64R8;
    double ___F64R9;
    double ___F64R10;
    /* Copy the state fields into locals. */
    ___r0 = ___ps->r[0];
    ___r1 = ___ps->r[1];
    ___r2 = ___ps->r[2];
    ___r3 = ___ps->r[3];
    ___r5 = ___ps->r[5];
    ___r6 = ___ps->r[6];
    ___r7 = ___ps->r[7];
    ___pc = ___ps->pc;
    /* ___r3 = slot 2 of ___r1; ___F64R4 = double in slot 0 of ___r3. */
	  ___r3 = (*(long *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2))));
	  ___F64R4 = *(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2)));
    /* ___r5 / ___r6 = slot 4 of ___r1 / ___r2.  ___F64R5 becomes the product
       of the double reached through ___glo_tbl[3]'s val field and the
       doubles in slot 1 of ___r5 and of ___r6. */
	  ___r5 = (*(long *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (4L) << 2)) << (3 - 2))));
	  ___F64R5 = *(double *) (((long) (((long *) (___r5 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
	  ___r6 = (*(long *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (4L) << 2)) << (3 - 2))));
	  ___F64R6 = *(double *) (((long) (((long *) (___r6 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
	  ___F64R5 = (((*(double *) ((((long *) ((*(___glo_struct *) ___glo_tbl[3]).val - 1)) + 1))) * ___F64R5) * ___F64R6);
    /* ___r1 is replaced by its own slot 0 and ___r7 is set to slot 0 of
       ___r2.  The differences of their doubles in slots 0, 1 and 2 are kept
       in ___F64R6, ___F64R9 and ___F64R1; the squares of those differences
       go to ___F64R8, ___F64R10 and ___F64R7. */
	  ___r1 = (*(long *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2))));
	  ___F64R6 = *(double *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2)));
	  ___r7 = (*(long *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2))));
	  ___F64R8 = *(double *) (((long) (((long *) (___r7 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2)));
	  ___F64R6 = (___F64R6 - ___F64R8);
	  ___F64R8 = (___F64R6 * ___F64R6);
	  ___F64R9 = *(double *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
	  ___F64R10 = *(double *) (((long) (((long *) (___r7 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
	  ___F64R9 = (___F64R9 - ___F64R10);
	  ___F64R10 = (___F64R9 * ___F64R9);
	  ___F64R1 = *(double *) (((long) (((long *) (___r1 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2)));
	  ___F64R7 = *(double *) (((long) (((long *) (___r7 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2)));
	  ___F64R1 = (___F64R1 - ___F64R7);
	  ___F64R7 = (___F64R1 * ___F64R1);
    /* ___F64R7 = sum of the three squares, ___F64R8 = its square root.
       ___F64R5 is then divided by (double reached through ___glo_tbl[2]'s
       val field) * sum * root. */
	  ___F64R7 = ((___F64R8 + ___F64R10) + ___F64R7);
	  ___F64R8 = sqrt(___F64R7);
	  ___F64R7 = (((*(double *) ((((long *) ((*(___glo_struct *) ___glo_tbl[2]).val - 1)) + 1))) * ___F64R7) * ___F64R8);
	  ___F64R5 = (___F64R5 / ___F64R7);
    /* Subtract ___F64R5 times each difference from the doubles in
       slots 0, 1 and 2 of ___r3.  The three products stay in ___F64R6,
       ___F64R7 and ___F64R1 for reuse below. */
	  ___F64R6 = (___F64R5 * ___F64R6);
	  ___F64R4 = (___F64R4 - ___F64R6);
	  *(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2))) = ___F64R4;
	  ___F64R4 = *(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
	  ___F64R7 = (___F64R5 * ___F64R9);
	  ___F64R4 = (___F64R4 - ___F64R7);
	  *(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2))) = ___F64R4;
	  ___F64R4 = *(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2)));
	  ___F64R1 = (___F64R5 * ___F64R1);
	  ___F64R4 = (___F64R4 - ___F64R1);
	  *(double *) (((long) (((long *) (___r3 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2))) = ___F64R4;
    /* ___r2 is replaced by its own slot 2, and the same three products are
       added to the doubles in its slots 0, 1 and 2. */
	  ___r2 = (*(long *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2))));
	  ___F64R3 = *(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2)));
	  ___F64R3 = (___F64R3 + ___F64R6);
	  *(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (0L) << 2)) << (3 - 2))) = ___F64R3;
	  ___F64R3 = *(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2)));
	  ___F64R3 = (___F64R3 + ___F64R7);
	  *(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (1L) << 2)) << (3 - 2))) = ___F64R3;
	  ___F64R3 = *(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2)));
	  ___F64R1 = (___F64R3 + ___F64R1);
	  *(double *) (((long) (((long *) (___r2 - 1)) + 1)) + ((((long) (2L) << 2)) << (3 - 2))) = ___F64R1;
    /* r[1] ends up holding the same value as r[2]. */
	  ___r1 = (___r2);
    /* Write the locals back into *___ps; r[0] is not stored. */
    ___ps->pc = ___pc;
    ___ps->r[1] = ___r1;
    ___ps->r[2] = ___r2;
    ___ps->r[3] = ___r3;
    ___ps->r[5] = ___r5;
    ___ps->r[6] = ___r6;
    ___ps->r[7] = ___r7;
    return ___pc;
}

The assembler:

	# Compiler output for ___H__20_test_2d_alpha2.  The displacements off
	# $16 match the ___processor_state_struct layout with 8-byte longs and
	# pointers: r[1]=80, r[2]=88, r[3]=96, r[5]=112, r[6]=120, r[7]=128,
	# pc=232, so $16 carries the ___ps argument.
	.file	1 "crap.c"
	.set noat
	.set noreorder
	.arch ev6
.text
	.align 5
	.ent ___H__20_test_2d_alpha2
___H__20_test_2d_alpha2:
	.eflag 48
	.frame $30,0,$26,0
	ldgp $29,0($27)
$___H__20_test_2d_alpha2..ng:
	.prologue 1
	# Object accesses use displacement 7+8k: (X - 1) + 8 + 8k in the C
	# source, e.g. 39($1) is the k=4 access through ___r1.
	lda $3,___glo_tbl
	ldq $1,80($16)
	ldq $2,88($16)
	# ___ps->pc is loaded into an FP register here and copied to $0 by the
	# ftoit below.
	ldt $f15,232($16)
	ldq $5,24($3)
	ldq $8,39($1)
	ldq $22,39($2)
	ldq $6,7($2)
	ldq $7,16($3)
	ldq $3,23($1)
	ldq $4,0($5)
	ldt $f24,15($8)
	ldt $f25,15($22)
	ldq $1,7($1)
	ldt $f27,7($6)
	ldt $f26,15($6)
	ldt $f11,7($4)
	ldq $4,0($7)
	ldt $f13,15($1)
	ldt $f12,23($1)
	ldt $f23,23($6)
	ldt $f22,7($3)
	ldt $f14,7($4)
	ldq $2,23($2)
	multsu $f11,$f24,$f10
	# NOTE(review): ftoit is an FP-to-integer register bit copy rather than
	# a value conversion; this looks like item (1) of the message.  $0 is
	# presumably the return-value register, in which case the result is
	# the returned ___pc -- confirm against the Alpha calling standard.
	ftoit $f15,$0
	subtsu $f13,$f26,$f11
	multsu $f10,$f25,$f24
	ldt $f25,7($1)
	# First of the fmov copies (item (2) of the message): the subtsu
	# result written to $f11 above is copied into $f13.  The same
	# "compute into $f10/$f11, then fmov" pattern repeats below.
	fmov $f11,$f13
	subtsu $f12,$f23,$f11
	subtsu $f25,$f27,$f10
	multsu $f13,$f13,$f26
	fmov $f11,$f12
	fmov $f10,$f25
	multsu $f12,$f12,$f23
	multsu $f25,$f25,$f27
	addtsu $f27,$f26,$f10
	addtsu $f23,$f10,$f11
	fmov $f11,$f23
	# The sqrt() call in the C source appears here as an inline
	# instruction; no call is emitted.
	sqrttsu $f23,$f11
	multsu $f14,$f23,$f10
	multsu $f10,$f11,$f23
	divtsu $f24,$f23,$f10
	fmov $f10,$f24
	multsu $f25,$f24,$f11
	multsu $f24,$f13,$f23
	multsu $f12,$f24,$f10
	fmov $f11,$f25
	fmov $f10,$f12
	# Read-modify-write of the three doubles at 7/15/23($3) (subtract).
	subtsu $f22,$f25,$f11
	fmov $f11,$f22
	stt $f22,7($3)
	ldt $f22,15($3)
	subtsu $f22,$f23,$f10
	fmov $f10,$f22
	stt $f22,15($3)
	ldt $f22,23($3)
	subtsu $f22,$f12,$f11
	fmov $f11,$f22
	stt $f22,23($3)
	# Read-modify-write of the three doubles at 7/15/23($2) (add).
	ldt $f10,7($2)
	addtsu $f10,$f25,$f11
	fmov $f11,$f10
	stt $f10,7($2)
	ldt $f10,15($2)
	addtsu $f10,$f23,$f11
	fmov $f11,$f10
	stt $f10,15($2)
	ldt $f10,23($2)
	addtsu $f12,$f10,$f11
	fmov $f11,$f12
	stt $f12,23($2)
	# Write-back of r[7], r[2], r[3], r[5], r[6], r[1]; $2 is stored to
	# both r[2] and r[1].  No store to 232($16) (pc) is emitted.
	stq $6,128($16)
	stq $2,88($16)
	stq $3,96($16)
	stq $8,112($16)
	stq $22,120($16)
	stq $2,80($16)
	ret $31,($26),1
	.end ___H__20_test_2d_alpha2
	# Zero-filled storage for the two file-scope statics of the C source:
	# ___lp (8 bytes, in .sbss) and ___glo_tbl (32 bytes, in .bss).
.section	.sbss,"aw"
	.type	 ___lp,@object
	.size	 ___lp,8
	.align 3
___lp:
	.zero	8
.section	.bss
	.type	 ___glo_tbl,@object
	.size	 ___glo_tbl,32
	.align 3
___glo_tbl:
	.zero	32
	.ident	"GCC: (GNU) 2.96 19990922 (experimental)"

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]