Re: vararg calls with libffi on x86_64
- From: Richard Henderson <rth at redhat dot com>
- To: Andreas Degert <ad at papyrus-gmbh dot de>
- Cc: gcc-patches at gcc dot gnu dot org, Andrew Haley <aph at redhat dot com>, Jan Hubicka <hubicka at ucw dot cz>
- Date: Tue, 3 May 2005 18:28:59 -0700
- Subject: Re: vararg calls with libffi on x86_64
- References: <E1DRZei-0000Tc-00@pluto.noname> <17010.31036.654251.593520@cuddles.cambridge.redhat.com> <20050429232348.GD8858@redhat.com> <87is1z99lz.fsf@pluto.noname>
On Wed, May 04, 2005 at 01:24:08AM +0200, Andreas Degert wrote:
> In the attached patch the number of SSE registers actually needed is
> used. But then I think it doesn't make much sense to use it only to
> keep the callee from saving all the SSE registers.
Hmm. I'm not a fan of some of the changes.
> The lower 8 bits of the flags were used for the ffi type. I changed
> that to use only 4 bits (at the moment there are 15 types; any chance
> there will ever be more than 16?), and the other 4 bits hold the number
> of SSE registers used. This is set in ffi_prep_cif_machdep(), and the
> trampoline is changed to put that number into rax.
It's just as easy to pass in the number as an argument from ffi_call.
Like in the attached.
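For reference, the psABI requires the caller of a variadic function to
pass an upper bound on the number of vector registers used in %al; a
compiler does this automatically for ordinary C code. A minimal
illustration (the printf call stands in for any varargs callee):

  #include <stdio.h>

  int
  main (void)
  {
    /* Two doubles travel in %xmm0/%xmm1, so the compiler emits
       something like "movl $2, %eax" before "call printf".  The
       patch below does the same by hand: ffi_call passes the count
       as a sixth argument (arriving in %r9d) and ffi_call_unix64
       moves it into %eax.  */
    printf ("%f %f\n", 1.0, 2.0);
    return 0;
  }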
> ffi_closure has changed because the trampoline is bigger now... is
> that a compatibility problem?
Yes. If you can't fit everything into 24 bytes, then you'll have to
"encode" that information by using different entry points in unix64.S.
> - reg_args->gpr[gprcount] = 0;
> memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
This change is technically wrong. We're going to load all 8 bytes;
we should initialize all 8 bytes. If valgrind ever gets ported to
amd64, it'll complain at you.
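A minimal C sketch of the intended sequence (slot type simplified to
unsigned long):

  #include <string.h>

  /* The assembly callee reloads each slot with a full 8-byte move,
     so every byte must be defined even when the argument is smaller.
     Zero the slot first, then overlay the value.  */
  static void
  store_gpr_arg (unsigned long *slot, const void *a, size_t size)
  {
    *slot = 0;                               /* define all 8 bytes */
    memcpy (slot, a, size < 8 ? size : 8);
  }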
> + jmp *%rdx /* jump to first necessary movdqa */
A branch misprediction due to the indirect jump is worse than the extra
data movement, IIRC. Yes, the ABI recommends something like this
for varargs, but I think that was a mistake. 0 or non-0 tests are
better.
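Restated in C, the compromise the attached patch implements in
unix64.S (register file and struct layout simplified for the sketch):

  struct register_args
  {
    unsigned long gpr[6];
    double sse[8][2];            /* one 16-byte slot per xmm reg */
  };

  static double xmm_regs[8][2];  /* stand-in for %xmm0..%xmm7 */

  static void
  load_arg_registers (const struct register_args *reg, unsigned ssecount)
  {
    /* One predictable 0/non-0 test instead of a computed jump into
       the middle of the movdqa sequence: load zero or all eight.  */
    if (ssecount == 0)
      return;
    for (int i = 0; i < 8; i++)
      {
        xmm_regs[i][0] = reg->sse[i][0];
        xmm_regs[i][1] = reg->sse[i][1];
      }
  }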
For ffi_closure, note that there is one byte free. You could use
clc/stc to initialize the carry flag and branch on that once you're
inside ffi_closure_unix64. That gets you a 0/non-0 test easily.
> - /* The first byte of the flags contains the FFI_TYPE. */
> + /* The first 4 bits of the flags contain the FFI_TYPE. */
> + andb $0xf, %cl
> movzbl %cl, %r10d
This was wrong, since it corrupts the value of %ecx for later
struct processing. It should have been
movl %ecx, %r10d
andl $15, %r10d
But since we don't need to change the format of flags at all...
r~
Index: src/x86/ffi64.c
===================================================================
RCS file: /cvs/gcc/gcc/libffi/src/x86/ffi64.c,v
retrieving revision 1.8
diff -u -p -d -r1.8 ffi64.c
--- src/x86/ffi64.c 25 Dec 2004 09:54:40 -0000 1.8
+++ src/x86/ffi64.c 4 May 2005 00:59:21 -0000
@@ -42,7 +42,7 @@ struct register_args
};
extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
- void *raddr, void (*fnaddr)());
+ void *raddr, void (*fnaddr)(), unsigned ssecount);
/* All reference to register classes here is identical to the code in
gcc/config/i386/i386.c. Do *not* change one without the other. */
@@ -424,7 +424,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), vo
}
ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
- cif->flags, rvalue, fn);
+ cif->flags, rvalue, fn, ssecount);
}
Index: src/x86/unix64.S
===================================================================
RCS file: /cvs/gcc/gcc/libffi/src/x86/unix64.S,v
retrieving revision 1.7
diff -u -p -d -r1.7 unix64.S
--- src/x86/unix64.S 27 Dec 2004 09:20:10 -0000 1.7
+++ src/x86/unix64.S 4 May 2005 00:59:21 -0000
@@ -53,6 +53,7 @@ ffi_call_unix64:
.LUW1:
movq %rdi, %r10 /* Save a copy of the register area. */
movq %r8, %r11 /* Save a copy of the target fn. */
+ movl %r9d, %eax /* Set number of SSE registers. */
/* Load up all argument registers. */
movq (%r10), %rdi
@@ -61,14 +62,9 @@ ffi_call_unix64:
movq 24(%r10), %rcx
movq 32(%r10), %r8
movq 40(%r10), %r9
- movdqa 48(%r10), %xmm0
- movdqa 64(%r10), %xmm1
- movdqa 80(%r10), %xmm2
- movdqa 96(%r10), %xmm3
- movdqa 112(%r10), %xmm4
- movdqa 128(%r10), %xmm5
- movdqa 144(%r10), %xmm6
- movdqa 160(%r10), %xmm7
+ testl %eax, %eax
+ jnz .Lload_sse
+.Lret_from_load_sse:
/* Deallocate the reg arg area. */
leaq 176(%r10), %rsp
@@ -91,6 +87,21 @@ ffi_call_unix64:
addq %r11, %r10
jmp *%r10
+ /* Many times we can avoid loading any SSE registers at all.
+ It's not worth an indirect jump to load the exact set of
+ SSE registers needed; zero or all is a good compromise. */
+ .align 2
+.Lload_sse:
+ movdqa 48(%r10), %xmm0
+ movdqa 64(%r10), %xmm1
+ movdqa 80(%r10), %xmm2
+ movdqa 96(%r10), %xmm3
+ movdqa 112(%r10), %xmm4
+ movdqa 128(%r10), %xmm5
+ movdqa 144(%r10), %xmm6
+ movdqa 160(%r10), %xmm7
+ jmp .Lret_from_load_sse
+
.section .rodata
.Lstore_table:
.long .Lst_void-.Lstore_table /* FFI_TYPE_VOID */