Re: vararg calls with libffi on x86_64
Richard Henderson <rth@redhat.com> writes:
[...]
> Two more small changes...
[... trampoline setup / replace subq with leaq]
Done, and the patch is appended. (As you might have realized, I'm not
an expert in assembly language, nor in English; thank you very much
for your help, I learned quite a bit.) No more changes from me; I'm
off until next week ;-)
ciao
Andreas
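
For context, the ABI point the subject line refers to: on x86-64, a
call to a variadic function must set %al to an upper bound on the
number of vector registers used for arguments, so the callee's
prologue knows whether it has to spill %xmm0-%xmm7 into the register
save area. That is why ffi_call_unix64 grows an ssecount argument
below, and why unix64.S copies it from %r9d into %eax before the
call. A minimal sketch of the callee side of that protocol, in plain
C with nothing libffi-specific assumed:

#include <stdarg.h>
#include <stdio.h>

/* A variadic callee: its compiler-generated prologue reads %al to
   decide whether %xmm0-%xmm7 must be dumped into the register save
   area before va_arg can find them.  */
static double sum(int n, ...)
{
  va_list ap;
  double total = 0.0;
  va_start(ap, n);
  for (int i = 0; i < n; i++)
    total += va_arg(ap, double);
  va_end(ap);
  return total;
}

int main(void)
{
  /* A compiler sets %al (here: to 2) before this call; libffi has
     to do the same by hand, which is what the new ssecount argument
     and the movl %r9d, %eax in unix64.S are for.  */
  printf("%g\n", sum(2, 1.0, 2.0));
  return 0;
}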
Index: src/x86/ffi64.c
===================================================================
RCS file: /cvsroot/gcc/gcc/libffi/src/x86/ffi64.c,v
retrieving revision 1.8
diff -u -r1.8 ffi64.c
--- src/x86/ffi64.c 25 Dec 2004 09:54:40 -0000 1.8
+++ src/x86/ffi64.c 4 May 2005 20:52:30 -0000
@@ -42,7 +42,7 @@
};
extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
- void *raddr, void (*fnaddr)());
+ void *raddr, void (*fnaddr)(), unsigned ssecount);
/* All reference to register classes here is identical to the code in
gcc/config/i386/i386.c. Do *not* change one without the other. */
@@ -303,10 +303,9 @@
else if (sse0 && sse1)
flags |= 1 << 10;
/* Mark the true size of the structure. */
- flags |= cif->rtype->size << 11;
+ flags |= cif->rtype->size << 12;
}
}
- cif->flags = flags;
/* Go over all arguments and determine the way they should be passed.
If it's in a register and there is space for it, let that be so. If
@@ -331,6 +330,9 @@
ssecount += nsse;
}
}
+ if (ssecount)
+ flags |= 1 << 11;
+ cif->flags = flags;
cif->bytes = bytes;
return FFI_OK;
@@ -353,7 +355,7 @@
address then we need to make one. Note the setting of flags to
VOID above in ffi_prep_cif_machdep. */
ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
- && cif->flags == FFI_TYPE_VOID);
+ && (cif->flags & 0xff) == FFI_TYPE_VOID);
if (rvalue == NULL && ret_in_memory)
rvalue = alloca (cif->rtype->size);
@@ -424,7 +426,7 @@
}
ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
- cif->flags, rvalue, fn);
+ cif->flags, rvalue, fn, ssecount);
}
@@ -439,13 +441,18 @@
volatile unsigned short *tramp;
tramp = (volatile unsigned short *) &closure->tramp[0];
+
tramp[0] = 0xbb49; /* mov <code>, %r11 */
- tramp[5] = 0xba49; /* mov <data>, %r10 */
- tramp[10] = 0xff49; /* jmp *%r11 */
- tramp[11] = 0x00e3;
*(void * volatile *) &tramp[1] = ffi_closure_unix64;
+ tramp[5] = 0xba49; /* mov <data>, %r10 */
*(void * volatile *) &tramp[6] = closure;
+ /* Set the carry bit iff the function uses any SSE registers.
+ This is clc or stc, together with the first byte of the jmp. */
+ tramp[10] = cif->flags & (1 << 11) ? 0x49f9 : 0x49f8;
+
+ tramp[11] = 0xe3ff; /* jmp *%r11 */
+
closure->cif = cif;
closure->fun = fun;
closure->user_data = user_data;
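
To make the new bit layout concrete: the low byte of cif->flags
still holds the return-type code, bit 11 now says whether any
argument is passed in an SSE register, and the true structure size
moves up to bits 12-31 (hence the shrl $12 further down). A hedged
sketch of the packing; the macro names are made up for illustration
and do not exist in libffi:

#include <stdio.h>

/* Illustrative accessors for the cif->flags layout after this
   patch; these names are hypothetical, not libffi API.  */
#define FLAGS_RET_TYPE(f)     ((f) & 0xff)      /* FFI_TYPE_* of return */
#define FLAGS_USES_SSE(f)     (((f) >> 11) & 1) /* any SSE arg register */
#define FLAGS_STRUCT_SIZE(f)  ((f) >> 12)       /* true structure size */

int main(void)
{
  unsigned flags = 0;   /* FFI_TYPE_VOID: struct returned in memory */
  flags |= 1u << 11;    /* at least one argument uses an SSE register */
  flags |= 24u << 12;   /* the structure is really 24 bytes */

  printf("type=%u sse=%u size=%u\n", FLAGS_RET_TYPE(flags),
         FLAGS_USES_SSE(flags), FLAGS_STRUCT_SIZE(flags));
  return 0;             /* prints: type=0 sse=1 size=24 */
}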
Index: src/x86/unix64.S
===================================================================
RCS file: /cvsroot/gcc/gcc/libffi/src/x86/unix64.S,v
retrieving revision 1.7
diff -u -r1.7 unix64.S
--- src/x86/unix64.S 27 Dec 2004 09:20:10 -0000 1.7
+++ src/x86/unix64.S 4 May 2005 20:52:30 -0000
@@ -53,6 +53,7 @@
.LUW1:
movq %rdi, %r10 /* Save a copy of the register area. */
movq %r8, %r11 /* Save a copy of the target fn. */
+ movl %r9d, %eax /* Set number of SSE registers. */
/* Load up all argument registers. */
movq (%r10), %rdi
@@ -61,14 +62,9 @@
movq 24(%r10), %rcx
movq 32(%r10), %r8
movq 40(%r10), %r9
- movdqa 48(%r10), %xmm0
- movdqa 64(%r10), %xmm1
- movdqa 80(%r10), %xmm2
- movdqa 96(%r10), %xmm3
- movdqa 112(%r10), %xmm4
- movdqa 128(%r10), %xmm5
- movdqa 144(%r10), %xmm6
- movdqa 160(%r10), %xmm7
+ testl %eax, %eax
+ jnz .Lload_sse
+.Lret_from_load_sse:
/* Deallocate the reg arg area. */
leaq 176(%r10), %rsp
@@ -91,6 +87,21 @@
addq %r11, %r10
jmp *%r10
+ /* Many times we can avoid loading any SSE registers at all.
+ It's not worth an indirect jump to load the exact set of
+ SSE registers needed; zero or all is a good compromise. */
+ .align 2
+.Lload_sse:
+ movdqa 48(%r10), %xmm0
+ movdqa 64(%r10), %xmm1
+ movdqa 80(%r10), %xmm2
+ movdqa 96(%r10), %xmm3
+ movdqa 112(%r10), %xmm4
+ movdqa 128(%r10), %xmm5
+ movdqa 144(%r10), %xmm6
+ movdqa 160(%r10), %xmm7
+ jmp .Lret_from_load_sse
+
.section .rodata
.Lstore_table:
.long .Lst_void-.Lstore_table /* FFI_TYPE_VOID */
@@ -181,9 +192,9 @@
movq %rax, (%rsi)
movq %rdx, 8(%rsi)
- /* Bits 11-31 contain the true size of the structure. Copy from
+ /* Bits 12-31 contain the true size of the structure. Copy from
the scratch area to the true destination. */
- shrl $11, %ecx
+ shrl $12, %ecx
rep movsb
ret
.LUW3:
@@ -195,23 +206,18 @@
ffi_closure_unix64:
.LUW4:
- subq $200, %rsp
+ /* The carry flag is set by the trampoline iff SSE registers
+ are used. Don't clobber it before the branch instruction. */
+ leaq -200(%rsp), %rsp
.LUW5:
-
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
- movdqa %xmm0, 48(%rsp)
- movdqa %xmm1, 64(%rsp)
- movdqa %xmm2, 80(%rsp)
- movdqa %xmm3, 96(%rsp)
- movdqa %xmm4, 112(%rsp)
- movdqa %xmm5, 128(%rsp)
- movdqa %xmm6, 144(%rsp)
- movdqa %xmm7, 160(%rsp)
+ jc .Lsave_sse
+.Lret_from_save_sse:
movq %r10, %rdi
leaq 176(%rsp), %rsi
@@ -230,6 +236,19 @@
addq %r11, %r10
jmp *%r10
+ /* The same comment as for loading the SSE registers applies; see above. */
+ .align 2
+.Lsave_sse:
+ movdqa %xmm0, 48(%rsp)
+ movdqa %xmm1, 64(%rsp)
+ movdqa %xmm2, 80(%rsp)
+ movdqa %xmm3, 96(%rsp)
+ movdqa %xmm4, 112(%rsp)
+ movdqa %xmm5, 128(%rsp)
+ movdqa %xmm6, 144(%rsp)
+ movdqa %xmm7, 160(%rsp)
+ jmp .Lret_from_save_sse
+
.section .rodata
.Lload_table:
.long .Lld_void-.Lload_table /* FFI_TYPE_VOID */
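
Putting the two halves of the carry-flag trick together: the
trampoline ends in stc or clc immediately followed by jmp *%r11, and
ffi_closure_unix64 tests the flag with jc before executing anything
that would clobber it (hence leaq -200(%rsp), %rsp instead of subq).
A sketch of the 24 trampoline bytes that the ffi_prep_closure change
emits; the helper below is illustrative only, not part of the patch:

#include <string.h>

/* Illustrative reconstruction of the 24-byte trampoline built by
   ffi_prep_closure above; the encodings match the tramp[] stores.  */
static void build_trampoline(unsigned char t[24],
                             void *entry, void *data, int uses_sse)
{
  t[0] = 0x49; t[1] = 0xbb;        /* movabs $entry, %r11 */
  memcpy(&t[2], &entry, 8);
  t[10] = 0x49; t[11] = 0xba;      /* movabs $data, %r10 */
  memcpy(&t[12], &data, 8);
  t[20] = uses_sse ? 0xf9 : 0xf8;  /* stc / clc: carry = "uses SSE" */
  t[21] = 0x49;                    /* REX prefix of the jmp */
  t[22] = 0xff; t[23] = 0xe3;      /* jmp *%r11 */
}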