This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
-fno-crossjumping not working + weird asm behavior
- From: Michael Buro <Michael dot Buro at shaw dot ca>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Sat, 08 May 2004 15:03:30 -0600
- Subject: -fno-crossjumping not working + weird asm behavior
Hello,
compiling the inner loop of a virtual machine (see source below) using
gcc-3.4 on x86 I get the following output when using computed gotos
and -O3:
...
.L49:
movl (%esi), %eax
addl $4, %esi
jmp *%eax
.L33:
incl -156(%ebp)
movl %ebx, %eax
sall $5, %eax
movl %ebx, (%edi)
subl %ebx, %eax
incl %eax
cmpl $3124999, -156(%ebp)
jle .L34
movl %eax, (%edi)
leal -12(%ebp), %esp
xorl %eax, %eax
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.p2align 4,,7
.L32:
movl %ebx, (%edi)
movl %ebx, %edx
sall $4, %edx
subl %ebx, %edx
leal 2(%edx,%edx), %ebx
jmp .L49
.L31:
movl %ebx, (%edi)
leal 0(,%ebx,8), %eax
subl %ebx, %eax
leal 1(%ebx,%eax,4), %ebx
jmp .L49
.L30:
movl %ebx, (%edi)
leal 0(,%ebx,8), %eax
subl %ebx, %eax
.L46:
leal 1(,%eax,4), %ebx
jmp .L49
...
Every goto **pc++ first jumps to .L49. This costs cycles but is likely
the result of -fcrossjumping. However, when I switch it off using
-fno-crossjumping I get the following output:
.L43:
jmp *%eax
.L33:
incl -156(%ebp)
movl %ebx, %eax
sall $5, %eax
movl %ebx, (%edi)
subl %ebx, %eax
incl %eax
cmpl $3124999, -156(%ebp)
jle .L34
movl %eax, (%edi)
leal -12(%ebp), %esp
xorl %eax, %eax
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.p2align 4,,7
.L32:
movl %ebx, (%edi)
movl %ebx, %eax
sall $4, %eax
subl %ebx, %eax
leal 2(%eax,%eax), %ebx
movl (%esi), %eax
addl $4, %esi
jmp .L43 // WASTING CYCLES (expected jmp *%eax)
.L31:
movl %ebx, (%edi)
leal 0(,%ebx,8), %ecx
movl (%esi), %eax
subl %ebx, %ecx
addl $4, %esi
leal 1(%ebx,%ecx,4), %ebx
jmp .L43 // WASTING CYCLES (expected jmp *%eax)
...
The jump is still there, but part of the dispatch code (the load of the
next address and the increment of %esi) is now kept in each case. Funny.
Why is that happening? It wastes cycles and still looks like crossjumping.
I would expect the jmp .Lxx to go away when using -fno-crossjumping.
The next thing I tried was "helping" gcc by generating jmp *%eax on the
spot using inline assembly (see the JMP macro below). Now I am getting
the following code, which appears to be wrong: no code is actually
generated for the cases apart from the indirect jumps.
.p2align 4,,15
.L34:
.L2:
.L3:
.L4:
.L5:
.L6:
.L7:
.L8:
.L9:
.L10:
.L11:
.L12:
.L13:
.L14:
.L15:
.L16:
.L17:
.L18:
.L19:
.L20:
.L21:
.L22:
.L23:
.L24:
.L25:
.L26:
.L27:
.L28:
.L29:
.L30:
.L31:
.L32:
.L33:
movl -196(%ebp), %eax
#APP
jmp *%eax
#NO_APP
movl -208(%ebp), %eax
#APP
jmp *%eax
#NO_APP
movl -220(%ebp), %eax
#APP
jmp *%eax
#NO_APP
movl -232(%ebp), %eax
#APP
jmp *%eax
#NO_APP
movl -240(%ebp), %eax
#APP
jmp *%eax
#NO_APP
movl -248(%ebp), %eax
#APP
jmp *%eax
#NO_APP
....
Perhaps I am missing something crucial w.r.t. inline assembly. I
compiled the same code using icc and the code there looks OK. So my
questions are: how do I force gcc to generate jmp *%eax on the spot
and why is code missing when using asm() ?
Thanks.
/Michael Buro - University of Alberta
////////////////////////// test.c
#include <iostream>
using namespace std;
// Single-field record. main() reads and writes `foo` through a pointer.
struct X {
  int foo;  // the only state; multiplied and incremented by main()
};

// The one global instance, plus the global pointer that main() copies into
// its local `x` before running.
X context;
X *y = &context;
/*
 * JMP: fetch the next handler address from the threaded-code stream and
 * transfer control to it.
 *
 * Requirements on the expansion site: a pointer `pc` into a table of label
 * addresses (`&&label`) and a variable `addr` that can hold one entry must
 * both be in scope.  `pc` is post-incremented on every use.
 *
 * The default form uses GCC's labels-as-values extension (`goto *addr`), so
 * the compiler sees the indirect branch and keeps every handler reachable.
 *
 * The inline-assembly form is kept only to reproduce the problem described
 * in the report above, and must be requested explicitly by defining
 * DISPATCH_VIA_INLINE_ASM.  GCC assumes a plain asm statement falls through
 * to the next statement; it does not support jumping out of one (only an
 * `asm goto` that lists its target labels may do that).  Because the jump is
 * invisible to the optimizer, the handlers look like straight-line code and
 * their bodies can be merged or dropped, which matches the "missing code"
 * listing in the report.
 *
 * The body is wrapped in do { } while (0) so that `JMP;` behaves as a single
 * statement wherever it is used.
 *
 * NOTE(review): GCC's manual suggests trying -fno-gcse for programs built
 * around computed gotos - confirm whether it removes the shared `jmp`.
 */
#ifdef DISPATCH_VIA_INLINE_ASM
#define JMP \
do { addr = *pc++; asm volatile("jmp *%0" : : "r" (addr) ) ; } while (0)
#else
#define JMP \
do { addr = *pc++; goto *addr; } while (0)
#endif
/*
 * Dispatch micro-benchmark used as the reproducer for the report above.
 *
 * Thirty-two "handlers" each multiply x->foo by the handler's index and add
 * a small constant.  Three ways of sequencing them are present, selected by
 * the preprocessor conditionals inside the function:
 *   - a switch inside a for loop (currently disabled by `#if 0`),
 *   - computed gotos written directly as `goto **pc++` (currently enabled),
 *   - dispatch through the JMP macro (currently disabled).
 *
 * Always returns 0; the computed value of x->foo is never printed.
 *
 * NOTE(review): starting from 1 and applying the handlers in order, x->foo
 * appears to exceed the range of a 32-bit int around handler 13; signed
 * overflow is undefined behavior - confirm whether that matters here.
 */
int main()
{
context.foo = 1;
// Local copy of the global pointer `y`; every handler goes through it.
register X *x = y;
//register X *x __asm__("esi") = y;
#if 0
// vanilla code: switch dispatch
// 100000000 iterations; the low five bits of k pick the handler.
for (int k=0; k < 100000000; k++) {
switch(k & 31) {
case 0: x->foo *= 0; x->foo += 1; break;
case 1: x->foo *= 1; x->foo += 1; break;
case 2: x->foo *= 2; x->foo += 1; break;
case 3: x->foo *= 3; x->foo += 1; break;
case 4: x->foo *= 4; x->foo += 1; break;
case 5: x->foo *= 5; x->foo += 1; break;
case 6: x->foo *= 6; x->foo += 1; break;
case 7: x->foo *= 7; x->foo += 1; break;
case 8: x->foo *= 8; x->foo += 1; break;
case 9: x->foo *= 9; x->foo += 1; break;
case 10:x->foo *= 10; x->foo += 1; break;
case 11:x->foo *= 11; x->foo += 4; break;
case 12:x->foo *= 12; x->foo += 1; break;
case 13:x->foo *= 13; x->foo += 1; break;
case 14:x->foo *= 14; x->foo += 1; break;
case 15:x->foo *= 15; x->foo += 1; break;
case 16:x->foo *= 16; x->foo += 1; break;
case 17:x->foo *= 17; x->foo += 1; break;
case 18:x->foo *= 18; x->foo += 1; break;
case 19:x->foo *= 19; x->foo += 1; break;
case 20:x->foo *= 20; x->foo += 1; break;
case 21:x->foo *= 21; x->foo += 3; break;
case 22:x->foo *= 22; x->foo += 1; break;
case 23:x->foo *= 23; x->foo += 1; break;
case 24:x->foo *= 24; x->foo += 1; break;
case 25:x->foo *= 25; x->foo += 1; break;
case 26:x->foo *= 26; x->foo += 1; break;
case 27:x->foo *= 27; x->foo += 1; break;
case 28:x->foo *= 28; x->foo += 1; break;
case 29:x->foo *= 29; x->foo += 1; break;
case 30:x->foo *= 30; x->foo += 2; break;
case 31:x->foo *= 31; x->foo += 1; break;
default:;
}
}
#else
// Pass counter: incremented by case_31, which restarts at `loop` until it
// reaches 3125000.
int j = 0;
// Threaded-code "program": the addresses of the 32 handler labels, in order
// (GNU `&&label` extension).
const void *a[] = {
&&case_0,
&&case_1,
&&case_2,
&&case_3,
&&case_4,
&&case_5,
&&case_6,
&&case_7,
&&case_8,
&&case_9,
&&case_10,
&&case_11,
&&case_12,
&&case_13,
&&case_14,
&&case_15,
&&case_16,
&&case_17,
&&case_18,
&&case_19,
&&case_20,
&&case_21,
&&case_22,
&&case_23,
&&case_24,
&&case_25,
&&case_26,
&&case_27,
&&case_28,
&&case_29,
&&case_30,
&&case_31
};
//register void **pc __asm__("esi") = &a[0];
// Start of one pass: `pc` is reset to the first table entry and execution
// falls through into case_0.
// NOTE(review): because pc starts at &a[0], the first dispatch out of case_0
// jumps to case_0 again, so case_0 runs twice per pass - confirm intended.
loop:;
register const void **pc = &a[0];
// Scratch slot for the fetched handler address; referenced only by the JMP
// macro, so it is unused when the `goto **pc++` variant below is enabled.
register const void *addr;
#if 1
// computed goto dispatch
// Each handler ends by jumping to the address at *pc and advancing pc.
case_0: x->foo *= 0; x->foo += 1; goto **pc++;
case_1: x->foo *= 1; x->foo += 1; goto **pc++;
case_2: x->foo *= 2; x->foo += 1; goto **pc++;
case_3: x->foo *= 3; x->foo += 1; goto **pc++;
case_4: x->foo *= 4; x->foo += 1; goto **pc++;
case_5: x->foo *= 5; x->foo += 1; goto **pc++;
case_6: x->foo *= 6; x->foo += 1; goto **pc++;
case_7: x->foo *= 7; x->foo += 1; goto **pc++;
case_8: x->foo *= 8; x->foo += 1; goto **pc++;
case_9: x->foo *= 9; x->foo += 1; goto **pc++;
case_10:x->foo *= 10; x->foo += 1; goto **pc++;
case_11:x->foo *= 11; x->foo += 4; goto **pc++;
case_12:x->foo *= 12; x->foo += 1; goto **pc++;
case_13:x->foo *= 13; x->foo += 1; goto **pc++;
case_14:x->foo *= 14; x->foo += 1; goto **pc++;
case_15:x->foo *= 15; x->foo += 1; goto **pc++;
case_16:x->foo *= 16; x->foo += 1; goto **pc++;
case_17:x->foo *= 17; x->foo += 1; goto **pc++;
case_18:x->foo *= 18; x->foo += 1; goto **pc++;
case_19:x->foo *= 19; x->foo += 1; goto **pc++;
case_20:x->foo *= 20; x->foo += 1; goto **pc++;
case_21:x->foo *= 21; x->foo += 3; goto **pc++;
case_22:x->foo *= 22; x->foo += 1; goto **pc++;
case_23:x->foo *= 23; x->foo += 1; goto **pc++;
case_24:x->foo *= 24; x->foo += 1; goto **pc++;
case_25:x->foo *= 25; x->foo += 1; goto **pc++;
case_26:x->foo *= 26; x->foo += 1; goto **pc++;
case_27:x->foo *= 27; x->foo += 1; goto **pc++;
case_28:x->foo *= 28; x->foo += 1; goto **pc++;
case_29:x->foo *= 29; x->foo += 1; goto **pc++;
case_30:x->foo *= 30; x->foo += 2; goto **pc++;
// Last handler: no dispatch; counts the pass and either restarts at `loop`
// or falls through to the return.
case_31:x->foo *= 31; x->foo += 1; j++; if (j < 3125000) goto loop;
#else
// inline assembly computed goto
// Same handlers, but each one dispatches through the JMP macro, which uses
// the `pc` and `addr` variables declared above.
case_0: x->foo *= 0; x->foo += 1; JMP;
case_1: x->foo *= 1; x->foo += 1; JMP;
case_2: x->foo *= 2; x->foo += 1; JMP;
case_3: x->foo *= 3; x->foo += 1; JMP;
case_4: x->foo *= 4; x->foo += 1; JMP;
case_5: x->foo *= 5; x->foo += 1; JMP;
case_6: x->foo *= 6; x->foo += 1; JMP;
case_7: x->foo *= 7; x->foo += 1; JMP;
case_8: x->foo *= 8; x->foo += 1; JMP;
case_9: x->foo *= 9; x->foo += 1; JMP;
case_10:x->foo *= 10; x->foo += 1; JMP;
case_11:x->foo *= 11; x->foo += 4; JMP;
case_12:x->foo *= 12; x->foo += 1; JMP;
case_13:x->foo *= 13; x->foo += 1; JMP;
case_14:x->foo *= 14; x->foo += 1; JMP;
case_15:x->foo *= 15; x->foo += 1; JMP;
case_16:x->foo *= 16; x->foo += 1; JMP;
case_17:x->foo *= 17; x->foo += 1; JMP;
case_18:x->foo *= 18; x->foo += 1; JMP;
case_19:x->foo *= 19; x->foo += 1; JMP;
case_20:x->foo *= 20; x->foo += 1; JMP;
case_21:x->foo *= 21; x->foo += 3; JMP;
case_22:x->foo *= 22; x->foo += 1; JMP;
case_23:x->foo *= 23; x->foo += 1; JMP;
case_24:x->foo *= 24; x->foo += 1; JMP;
case_25:x->foo *= 25; x->foo += 1; JMP;
case_26:x->foo *= 26; x->foo += 1; JMP;
case_27:x->foo *= 27; x->foo += 1; JMP;
case_28:x->foo *= 28; x->foo += 1; JMP;
case_29:x->foo *= 29; x->foo += 1; JMP;
case_30:x->foo *= 30; x->foo += 2; JMP;
// Last handler: no dispatch; counts the pass and either restarts at `loop`
// or falls through to `end`.
case_31:x->foo *= 31; x->foo += 1; j++; if (j < 3125000) goto loop;
// No goto in this function targets `end`; it is reached only by fall-through.
end:;
#endif
#endif
return 0;
}