This is the mail archive of the mailing list for the GCC project.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: vararg calls with libffi on x86_64

On Wed, May 04, 2005 at 01:24:08AM +0200, Andreas Degert wrote:
> In the attached patch the number of needed SSE registers is used. But
> then I think it doesn't make much sense only to prevent the callee
> from saving all SSE registers.

Hmm.  I'm not a fan of some of the changes.

> The lower 8 bits of the flags were used for the ffi type. I changed
> that to only 4 bits (at the moment there are 15 types, any chance it
> gets more than 16?), and the other 4 bits are the number of used SSE
> registers. This is set in ffi_prep_cif_machdep(), and the trampoline
> is changed to put that number into rax.

It's just as easy to pass in the number as an argument from ffi_call,
as in the attached patch.

> ffi_closure has changed because the trampoline is bigger now.. is that
> a compatibility problem?

Yes.  If you can't fit everything into 24 bytes, then you'll have to
"encode" that information by using different entry points in unix64.S.

> -		  reg_args->gpr[gprcount] = 0;
>  		  memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);

This change is technically wrong.  We're going to load all 8 bytes; 
we should initialize all 8 bytes.  If valgrind ever gets ported to
amd64, it'll complain at you.

> +	jmp     *%rdx			/* jump to first necessary movdqa */

Branch mispredictions due to an indirect jump are worse than the extra
data movement, IIRC.  Yes, the ABI recommends something like this
for varargs, but I think that was a mistake.  0 or non-0 tests are
a better compromise.

For ffi_closure, note that there is one byte free.  You could use
clc/stc to initialize the carry flag and branch on that once you're
inside ffi_closure_unix64.  That gets you a 0/non-0 test easy.

> -	/* The first byte of the flags contains the FFI_TYPE.  */
> +	/* The first 4 bits of the flags contain the FFI_TYPE.  */
> +	andb    $0xf, %cl
>  	movzbl	%cl, %r10d

This was wrong, since it corrupts the value of %ecx for later
struct processing.  It should have been

	movl	%ecx, %r10d
	andl	$15, %r10d

But since we don't need to change the format of flags at all...


Index: src/x86/ffi64.c
RCS file: /cvs/gcc/gcc/libffi/src/x86/ffi64.c,v
retrieving revision 1.8
diff -u -p -d -r1.8 ffi64.c
--- src/x86/ffi64.c	25 Dec 2004 09:54:40 -0000	1.8
+++ src/x86/ffi64.c	4 May 2005 00:59:21 -0000
@@ -42,7 +42,7 @@ struct register_args
 extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
-			     void *raddr, void (*fnaddr)());
+			     void *raddr, void (*fnaddr)(), unsigned ssecount);
 /* All reference to register classes here is identical to the code in
    gcc/config/i386/i386.c. Do *not* change one without the other.  */
@@ -424,7 +424,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), vo
   ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
-		   cif->flags, rvalue, fn);
+		   cif->flags, rvalue, fn, ssecount);
Index: src/x86/unix64.S
RCS file: /cvs/gcc/gcc/libffi/src/x86/unix64.S,v
retrieving revision 1.7
diff -u -p -d -r1.7 unix64.S
--- src/x86/unix64.S	27 Dec 2004 09:20:10 -0000	1.7
+++ src/x86/unix64.S	4 May 2005 00:59:21 -0000
@@ -53,6 +53,7 @@ ffi_call_unix64:
 	movq	%rdi, %r10		/* Save a copy of the register area. */
 	movq	%r8, %r11		/* Save a copy of the target fn.  */
+	movl	%r9d, %eax		/* Set number of SSE registers.  */
 	/* Load up all argument registers.  */
 	movq	(%r10), %rdi
@@ -61,14 +62,9 @@ ffi_call_unix64:
 	movq	24(%r10), %rcx
 	movq	32(%r10), %r8
 	movq	40(%r10), %r9
-	movdqa	48(%r10), %xmm0
-	movdqa	64(%r10), %xmm1
-	movdqa	80(%r10), %xmm2
-	movdqa	96(%r10), %xmm3
-	movdqa	112(%r10), %xmm4
-	movdqa	128(%r10), %xmm5
-	movdqa	144(%r10), %xmm6
-	movdqa	160(%r10), %xmm7
+	testl	%eax, %eax
+	jnz	.Lload_sse
+.Lret_from_load_sse:
 	/* Deallocate the reg arg area.  */
 	leaq	176(%r10), %rsp
@@ -91,6 +87,21 @@ ffi_call_unix64:
 	addq	%r11, %r10
 	jmp	*%r10
+	/* Many times we can avoid loading any SSE registers at all.
+	   It's not worth an indirect jump to load the exact set of
+	   SSE registers needed; zero or all is a good compromise.  */
+	.align 2
+.Lload_sse:
+	movdqa	48(%r10), %xmm0
+	movdqa	64(%r10), %xmm1
+	movdqa	80(%r10), %xmm2
+	movdqa	96(%r10), %xmm3
+	movdqa	112(%r10), %xmm4
+	movdqa	128(%r10), %xmm5
+	movdqa	144(%r10), %xmm6
+	movdqa	160(%r10), %xmm7
+	jmp	.Lret_from_load_sse
 	.section .rodata
 	.long	.Lst_void-.Lstore_table		/* FFI_TYPE_VOID */

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]