This is the mail archive of the
gcc-help@gcc.gnu.org
mailing list for the GCC project.
register allocation on IA64
- From: <john dot gill at quadrics dot com>
- To: <gcc-help at gcc dot gnu dot org>
- Date: Thu, 17 Mar 2005 17:24:43 -0000
- Subject: register allocation on IA64
I have a simple piece of code:
/* Sums six consecutive longs starting at ptr.
 * Naive one-expression form: as the assembler dump below shows, GCC
 * accumulates every add through r8 (the return register), serializing
 * the additions.  Kept exactly as posted so it matches that dump. */
long
addem (long *ptr)
{
return (ptr[0] + ptr[1] + ptr[2] + ptr[3] + ptr[4] + ptr[5]);
}
This generates interleaved load and add instructions in a fairly
efficient way, using a different register to hold each loaded value.
However, the additions of the loaded values are serialized through
r8 (the return value register).
These are the relevant instructions from the assembler output below:
add r8 = r8, r16
add r8 = r8, r17
add r8 = r8, r18
add r8 = r8, r15
add r8 = r8, r14
This does not seem to be the most efficient method, because if I could
pair intermediate add results, using a different register (or reusing one)
for each result, I could obtain better packing of the insns in the VLIW slots.
However, this probably means using more registers to hold the
intermediate added values.
It seems to me that the insn scheduler has done the best
job it can given the register allocation scheme it has inherited.
Is there any way to get gcc to be more aggressive with its register
allocation? Is there a compile time flag that I'm missing? I guess
this is something that would be of generic benefit to all machines
with lots of registers (I have noticed a similar problem for SPARC code).
I can of course rewrite the code so that gcc emits more efficient insns
but this is ugly and a nightmare to support long term ...
/* Same sum as addem(), but hand-restructured as a balanced reduction
 * tree so the intermediate adds are independent of each other.  This
 * lets the scheduler pair adds into the same instruction group (see the
 * "paired add" lines in the dump below).  Kept exactly as posted so it
 * matches that dump. */
long
addem2 (long *ptr)
{
register long val0 = ptr[0];
register long val1 = ptr[1];
register long val2 = ptr[2];
register long val3 = ptr[3];
register long val4 = ptr[4];
register long val5 = ptr[5];
register long val6 = val0+val1;   /* first level of the reduction tree */
register long val7 = val2+val3;
register long val8 = val4+val5;
register long val9 = val6+val7;   /* second level */
return (val8 + val9);
}
Thanks in advance
John
compiled with gcc -O2 -S add.c -o add.s gives:
add.s
==================================
.file "add.c"
.pred.safe_across_calls p1-p5,p16-p63
.text
.align 16
// addem: GCC 2.96 output for the naive version.  Every add targets r8,
// so each add depends on the previous one; the ";;" stop bits below
// each serialized add mark the end of an instruction group, costing a
// cycle apiece.  (r32 is the first incoming argument, i.e. ptr.)
.global addem#
.proc addem#
addem:
.prologue
.body
adds r15 = 8, r32		// r15 = &ptr[1]
adds r14 = 16, r32		// r14 = &ptr[2]
ld8 r8 = [r32]			// r8 = ptr[0]
;;
ld8 r16 = [r15]			// r16 = ptr[1]
ld8 r17 = [r14]			// r17 = ptr[2]
adds r15 = 24, r32		// r15 = &ptr[3]
;;
add r8 = r8, r16
adds r14 = 32, r32		// r14 = &ptr[4]
ld8 r18 = [r15]			// r18 = ptr[3]
;;
add r8 = r8, r17 <<<<< serialized add
adds r32 = 40, r32		// r32 = &ptr[5]
ld8 r15 = [r14]			// r15 = ptr[4]
;;
add r8 = r8, r18 <<<<< serialized add
ld8 r14 = [r32]			// r14 = ptr[5]
;;
add r8 = r8, r15 <<<<< serialized add
;;
add r8 = r8, r14 <<<<< serialized add
br.ret.sptk.many b0		// return with sum in r8
.endp addem#
// addem2: GCC 2.96 output for the hand-restructured version.  The
// independent intermediate sums land in different registers (r15, r16,
// r8), so two adds can share an instruction group ("paired add") and
// fewer ";;" stop bits are needed.  (Archive note: the closing
// ".endp addem2#" appears to have been dropped by the mail archiver.)
.global addem2#
.proc addem2#
addem2:
.prologue
.body
adds r18 = 24, r32		// r18 = &ptr[3]
mov r14 = r32			// r14 = ptr
adds r17 = 16, r32		// r17 = &ptr[2]
;;
ld8 r19 = [r18]			// r19 = ptr[3]
ld8 r15 = [r14], 8		// r15 = ptr[0]; r14 post-incremented to &ptr[1]
ld8 r16 = [r17]			// r16 = ptr[2]
adds r18 = 40, r32		// r18 = &ptr[5]
;;
ld8 r17 = [r14]			// r17 = ptr[1]
add r16 = r16, r19 <<<<< paired add
adds r32 = 32, r32		// r32 = &ptr[4]
;;
add r15 = r15, r17 <<<<< paired add
ld8 r14 = [r18]			// r14 = ptr[5]
ld8 r8 = [r32]			// r8 = ptr[4]
;;
add r15 = r15, r16 <<<<< paired add
add r8 = r8, r14 <<<<< paired add
;;
add r8 = r8, r15		// final combine into the return register
br.ret.sptk.many b0
.ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.2 2.96-128.7.2)"