This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: vararg calls with libffi on x86_64


Richard Henderson <rth@redhat.com> writes:

> On Fri, Apr 29, 2005 at 07:13:16PM +0100, Andrew Haley wrote:
>> I agree.  It's very little extra overhead for the non-varargs case,
>> and it'll make varargs functions work.
>
> Surely we can compute the correct value...

In the attached patch the number of needed SSE registers is used. But
then I think it doesn't make much sense only to prevent the callee
from saving all SSE registers.

All SSE registers were read when entering the closure via the
trampoline, and set when calling the function with ffi_call().

The lower 8 bits of the flags were used for the ffi type. I changed
that to only 4 bits (at the moment there are 15 types, any chance it
gets more than 16?), and the other 4 bits are the number of used SSE
registers. This is set in ffi_prep_cif_machdep(), and the trampoline
is changed to put that number into rax.

ffi_closure has changed because the trampoline is bigger now... is
that a compatibility problem?


Jan Hubicka <hubicka@ucw.cz> writes:

[...]
>>  > Surely we can compute the correct value...
[...]
> It will cause program that never initialize FPU unit since it never uses
> FP to initialize it.  Not sure if this is actual problem with all the
> lazy FPU initialization stuff in nowadays.

After the patch an SSE register will only be used unconditionally when a
structure that can be put into registers is used as parameter or
return value.

ciao
Andreas

Index: src/x86/ffi64.c
===================================================================
RCS file: /cvsroot/gcc/gcc/libffi/src/x86/ffi64.c,v
retrieving revision 1.8
diff -u -r1.8 ffi64.c
--- src/x86/ffi64.c	25 Dec 2004 09:54:40 -0000	1.8
+++ src/x86/ffi64.c	3 May 2005 23:20:02 -0000
@@ -31,6 +31,19 @@
 
 #ifdef __x86_64__
 
+/* Layout of ffi_cif.flags (bits from low to high):
+   0-3    type of return value (FFI_TYPE_xxx)
+   4-7    number of SSE registers used for function parameters
+   8      when set: return value in %xmm0 / %rax
+   9      when set: return value in %rax  / %xmm0
+   10     when set: return value in %xmm0 / %xmm1
+   11-31  return value: size of structure
+*/
+
+#if FFI_TYPE_LAST > 15
+#error more than 15 ffi types, change layout of ffi_cif.flags
+#endif
+
 #define MAX_GPR_REGS 6
 #define MAX_SSE_REGS 8
 
@@ -306,7 +319,6 @@
 	  flags |= cif->rtype->size << 11;
 	}
     }
-  cif->flags = flags;
 
   /* Go over all arguments and determine the way they should be passed.
      If it's in a register and there is space for it, let that be so. If
@@ -332,6 +344,7 @@
 	}
     }
   cif->bytes = bytes;
+  cif->flags = flags | (ssecount << 4);
 
   return FFI_OK;
 }
@@ -353,7 +366,7 @@
      address then we need to make one.  Note the setting of flags to
      VOID above in ffi_prep_cif_machdep.  */
   ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
-		   && cif->flags == FFI_TYPE_VOID);
+		   && (cif->flags & 0xf) == FFI_TYPE_VOID);
   if (rvalue == NULL && ret_in_memory)
     rvalue = alloca (cif->rtype->size);
 
@@ -405,7 +418,6 @@
 		{
 		case X86_64_INTEGER_CLASS:
 		case X86_64_INTEGERSI_CLASS:
-		  reg_args->gpr[gprcount] = 0;
 		  memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
 		  gprcount++;
 		  break;
@@ -439,12 +451,14 @@
   volatile unsigned short *tramp;
 
   tramp = (volatile unsigned short *) &closure->tramp[0];
-  tramp[0] = 0xbb49;		/* mov <code>, %r11	*/
-  tramp[5] = 0xba49;		/* mov <data>, %r10	*/
-  tramp[10] = 0xff49;		/* jmp *%r11	*/
-  tramp[11] = 0x00e3;
-  *(void * volatile *) &tramp[1] = ffi_closure_unix64;
-  *(void * volatile *) &tramp[6] = closure;
+  ((volatile char*)tramp)[0] = 0xb0;    /* mov <sse_cnt>, %al     */
+  ((volatile char*)tramp)[1] = (cif->flags >> 4) & 0xf;
+  tramp[1] = 0xbb49;		/* mov <code>, %r11	*/
+  tramp[6] = 0xba49;		/* mov <data>, %r10	*/
+  tramp[11] = 0xff49;		/* jmp *%r11	*/
+  tramp[12] = 0x00e3;
+  *(void * volatile *) &tramp[2] = ffi_closure_unix64;
+  *(void * volatile *) &tramp[7] = closure;
 
   closure->cif = cif;
   closure->fun = fun;
Index: src/x86/ffitarget.h
===================================================================
RCS file: /cvsroot/gcc/gcc/libffi/src/x86/ffitarget.h,v
retrieving revision 1.1
diff -u -r1.1 ffitarget.h
--- src/x86/ffitarget.h	21 Oct 2003 19:07:52 -0000	1.1
+++ src/x86/ffitarget.h	3 May 2005 23:20:02 -0000
@@ -70,7 +70,7 @@
 #define FFI_CLOSURES 1
 
 #ifdef X86_64
-#define FFI_TRAMPOLINE_SIZE 24
+#define FFI_TRAMPOLINE_SIZE 26
 #define FFI_NATIVE_RAW_API 0
 #else
 #define FFI_TRAMPOLINE_SIZE 10
Index: src/x86/unix64.S
===================================================================
RCS file: /cvsroot/gcc/gcc/libffi/src/x86/unix64.S,v
retrieving revision 1.7
diff -u -r1.7 unix64.S
--- src/x86/unix64.S	27 Dec 2004 09:20:10 -0000	1.7
+++ src/x86/unix64.S	3 May 2005 23:20:02 -0000
@@ -55,20 +55,31 @@
 	movq	%r8, %r11		/* Save a copy of the target fn.  */
 
 	/* Load up all argument registers.  */
-	movq	(%r10), %rdi
-	movq	8(%r10), %rsi
-	movq	16(%r10), %rdx
-	movq	24(%r10), %rcx
-	movq	32(%r10), %r8
+	shrl    $4, %edx		/* extract count of SSE registers */
+	andq    $0xf, %rdx
+	movq    %rdx, %rax		/* move to rax for vararg calls */
+	leaq    0(,%rdx,4), %rdi	/* movdqa (see below) has 5 bytes */
+	addq    %rdx, %rdi
+	leaq    .Lmovend1(%rip), %rdx
+	subq    %rdi, %rdx		/* move backwards by count */
+	leaq    48(%r10), %rdi
+	jmp     *%rdx			/* jump to first necessary movdqa */
+
+	movdqa	112(%rdi), %xmm7
+	movdqa	96(%rdi), %xmm6
+	movdqa	80(%rdi), %xmm5
+	movdqa	64(%rdi), %xmm4
+	movdqa	48(%rdi), %xmm3
+	movdqa	32(%rdi), %xmm2
+	movdqa	16(%rdi), %xmm1
+	movdqa	0(%rdi), %xmm0
+.Lmovend1:
 	movq	40(%r10), %r9
-	movdqa	48(%r10), %xmm0
-	movdqa	64(%r10), %xmm1
-	movdqa	80(%r10), %xmm2
-	movdqa	96(%r10), %xmm3
-	movdqa	112(%r10), %xmm4
-	movdqa	128(%r10), %xmm5
-	movdqa	144(%r10), %xmm6
-	movdqa	160(%r10), %xmm7
+	movq	32(%r10), %r8
+	movq	24(%r10), %rcx
+	movq	16(%r10), %rdx
+	movq	8(%r10), %rsi
+	movq	(%r10), %rdi
 
 	/* Deallocate the reg arg area.  */
 	leaq	176(%r10), %rsp
@@ -84,7 +95,8 @@
 	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
 .LUW2:
 
-	/* The first byte of the flags contains the FFI_TYPE.  */
+	/* The first 4 bits of the flags contain the FFI_TYPE.  */
+	andb    $0xf, %cl
 	movzbl	%cl, %r10d
 	leaq	.Lstore_table(%rip), %r11
 	movslq	(%r11, %r10, 4), %r10
@@ -204,15 +216,23 @@
         movq    %rcx, 24(%rsp)
         movq    %r8, 32(%rsp)
         movq    %r9, 40(%rsp)
-	movdqa	%xmm0, 48(%rsp)
-	movdqa	%xmm1, 64(%rsp)
-	movdqa	%xmm2, 80(%rsp)
-	movdqa	%xmm3, 96(%rsp)
-	movdqa	%xmm4, 112(%rsp)
-	movdqa	%xmm5, 128(%rsp)
-	movdqa	%xmm6, 144(%rsp)
-	movdqa	%xmm7, 160(%rsp)
-
+	movzbl  %al, %eax		/* al is count of SSE regs (set by trampoline) */
+	movq    %rax, %rdx
+	leaq    0(,%rdx,4), %rax	/* movdqa (see below) has 5 bytes */
+	addq    %rdx, %rax
+	leaq    .Lmovend2(%rip), %rdx
+	subq    %rax, %rdx		/* move backwards by count */
+	leaq    48(%rsp), %rax
+	jmp     *%rdx			/* jump to first necessary movdqa */
+	movdqa  %xmm7, 112(%rax)
+	movdqa  %xmm6, 96(%rax)
+	movdqa  %xmm5, 80(%rax)
+	movdqa  %xmm4, 64(%rax)
+	movdqa  %xmm3, 48(%rax)
+	movdqa  %xmm2, 32(%rax)
+	movdqa  %xmm1, 16(%rax)
+	movdqa  %xmm0, 0(%rax)
+.Lmovend2:
 	movq	%r10, %rdi
 	leaq	176(%rsp), %rsi
 	movq	%rsp, %rdx


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]