Re: vararg calls with libffi on x86_64
- From: Richard Henderson <rth at redhat dot com>
- To: Andreas Degert <ad at papyrus-gmbh dot de>
- Cc: gcc-patches at gcc dot gnu dot org, Andrew Haley <aph at redhat dot com>, Jan Hubicka <hubicka at ucw dot cz>
- Date: Tue, 3 May 2005 18:28:59 -0700
- Subject: Re: vararg calls with libffi on x86_64
- References: <E1DRZei-0000Tc-00@pluto.noname> <17010.31036.654251.593520@cuddles.cambridge.redhat.com> <20050429232348.GD8858@redhat.com> <87is1z99lz.fsf@pluto.noname>
On Wed, May 04, 2005 at 01:24:08AM +0200, Andreas Degert wrote:
> In the attached patch the number of SSE registers actually needed is
> used. But then I think it doesn't make much sense to use it only to
> keep the callee from saving all the SSE registers.
Hmm. I'm not a fan of some of the changes.
> The lower 8 bits of the flags were used for the ffi type. I changed
> that to use only 4 bits (at the moment there are 15 types; any chance
> there will ever be more than 16?), and the other 4 bits hold the number
> of SSE registers used. This is set in ffi_prep_cif_machdep(), and the
> trampoline is changed to put that number into rax.
It's just as easy to pass in the number as an argument from ffi_call.
Like in the attached.
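For reference, the psABI requires the caller of a variadic function to
pass an upper bound on the number of vector registers used in %al; a
compiler does this automatically for ordinary C code. A minimal
illustration (the printf call stands in for any varargs callee):

  #include <stdio.h>

  int
  main (void)
  {
    /* Two doubles travel in %xmm0/%xmm1, so the compiler emits
       something like "movl $2, %eax" before "call printf".  The
       patch below does the same by hand: ffi_call passes the count
       as a sixth argument (arriving in %r9d) and ffi_call_unix64
       moves it into %eax.  */
    printf ("%f %f\n", 1.0, 2.0);
    return 0;
  }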
> ffi_closure has changed because the trampoline is bigger now... is
> that a compatibility problem?
Yes. If you can't fit everything into 24 bytes, then you'll have to
"encode" that information by using different entry points in unix64.S.
> - reg_args->gpr[gprcount] = 0;
> memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
This change is technically wrong. We're going to load all 8 bytes;
we should initialize all 8 bytes. If valgrind ever gets ported to
amd64, it'll complain at you.
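A minimal C sketch of the intended sequence (slot type simplified to
unsigned long):

  #include <string.h>

  /* The assembly callee reloads each slot with a full 8-byte move,
     so every byte must be defined even when the argument is smaller.
     Zero the slot first, then overlay the value.  */
  static void
  store_gpr_arg (unsigned long *slot, const void *a, size_t size)
  {
    *slot = 0;                               /* define all 8 bytes */
    memcpy (slot, a, size < 8 ? size : 8);
  }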
> + jmp *%rdx /* jump to first necessary movdqa */
A branch misprediction due to the indirect jump is worse than the extra
data movement, IIRC. Yes, the ABI recommends something like this
for varargs, but I think that was a mistake. 0 or non-0 tests are
better.
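Restated in C, the compromise the attached patch implements in
unix64.S (register file and struct layout simplified for the sketch):

  struct register_args
  {
    unsigned long gpr[6];
    double sse[8][2];            /* one 16-byte slot per xmm reg */
  };

  static double xmm_regs[8][2];  /* stand-in for %xmm0..%xmm7 */

  static void
  load_arg_registers (const struct register_args *reg, unsigned ssecount)
  {
    /* One predictable 0/non-0 test instead of a computed jump into
       the middle of the movdqa sequence: load zero or all eight.  */
    if (ssecount == 0)
      return;
    for (int i = 0; i < 8; i++)
      {
        xmm_regs[i][0] = reg->sse[i][0];
        xmm_regs[i][1] = reg->sse[i][1];
      }
  }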
For ffi_closure, note that there is one byte free. You could use
clc/stc to initialize the carry flag and branch on that once you're
inside ffi_closure_unix64. That gets you a 0/non-0 test easily.
> - /* The first byte of the flags contains the FFI_TYPE. */
> + /* The first 4 bits of the flags contain the FFI_TYPE. */
> + andb $0xf, %cl
> movzbl %cl, %r10d
This was wrong, since it corrupts the value of %ecx for later
struct processing. It should have been
movl %ecx, %r10d
andl $15, %r10d
But since we don't need to change the format of flags at all...
r~
Index: src/x86/ffi64.c
===================================================================
RCS file: /cvs/gcc/gcc/libffi/src/x86/ffi64.c,v
retrieving revision 1.8
diff -u -p -d -r1.8 ffi64.c
--- src/x86/ffi64.c 25 Dec 2004 09:54:40 -0000 1.8
+++ src/x86/ffi64.c 4 May 2005 00:59:21 -0000
@@ -42,7 +42,7 @@ struct register_args
};
extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
- void *raddr, void (*fnaddr)());
+ void *raddr, void (*fnaddr)(), unsigned ssecount);
/* All reference to register classes here is identical to the code in
gcc/config/i386/i386.c. Do *not* change one without the other. */
@@ -424,7 +424,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), vo
}
ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
- cif->flags, rvalue, fn);
+ cif->flags, rvalue, fn, ssecount);
}
Index: src/x86/unix64.S
===================================================================
RCS file: /cvs/gcc/gcc/libffi/src/x86/unix64.S,v
retrieving revision 1.7
diff -u -p -d -r1.7 unix64.S
--- src/x86/unix64.S 27 Dec 2004 09:20:10 -0000 1.7
+++ src/x86/unix64.S 4 May 2005 00:59:21 -0000
@@ -53,6 +53,7 @@ ffi_call_unix64:
.LUW1:
movq %rdi, %r10 /* Save a copy of the register area. */
movq %r8, %r11 /* Save a copy of the target fn. */
+ movl %r9d, %eax /* Set number of SSE registers. */
/* Load up all argument registers. */
movq (%r10), %rdi
@@ -61,14 +62,9 @@ ffi_call_unix64:
movq 24(%r10), %rcx
movq 32(%r10), %r8
movq 40(%r10), %r9
- movdqa 48(%r10), %xmm0
- movdqa 64(%r10), %xmm1
- movdqa 80(%r10), %xmm2
- movdqa 96(%r10), %xmm3
- movdqa 112(%r10), %xmm4
- movdqa 128(%r10), %xmm5
- movdqa 144(%r10), %xmm6
- movdqa 160(%r10), %xmm7
+ testl %eax, %eax
+ jnz .Lload_sse
+.Lret_from_load_sse:
/* Deallocate the reg arg area. */
leaq 176(%r10), %rsp
@@ -91,6 +87,21 @@ ffi_call_unix64:
addq %r11, %r10
jmp *%r10
+ /* Many times we can avoid loading any SSE registers at all.
+ It's not worth an indirect jump to load the exact set of
+ SSE registers needed; zero or all is a good compromise. */
+ .align 2
+.Lload_sse:
+ movdqa 48(%r10), %xmm0
+ movdqa 64(%r10), %xmm1
+ movdqa 80(%r10), %xmm2
+ movdqa 96(%r10), %xmm3
+ movdqa 112(%r10), %xmm4
+ movdqa 128(%r10), %xmm5
+ movdqa 144(%r10), %xmm6
+ movdqa 160(%r10), %xmm7
+ jmp .Lret_from_load_sse
+
.section .rodata
.Lstore_table:
.long .Lst_void-.Lstore_table /* FFI_TYPE_VOID */