This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: vararg calls with libffi on x86_64


I've committed the following version.  I remembered after your
last version that we'd also need to update the unwind info.


r~


        * src/x86/ffi64.c (ffi_prep_cif_machdep): Save sse-used flag in
        bit 11 of flags.
        (ffi_call): Mask return type field.  Pass ssecount to ffi_call_unix64.
        (ffi_prep_closure): Set carry bit if sse-used flag set.
        * src/x86/unix64.S (ffi_call_unix64): Add ssecount argument.
        Only load sse registers if ssecount non-zero.
        (ffi_closure_unix64): Only save sse registers if carry set on entry.

Index: src/x86/ffi64.c
===================================================================
RCS file: /cvs/gcc/gcc/libffi/src/x86/ffi64.c,v
retrieving revision 1.8
diff -u -p -d -r1.8 ffi64.c
--- src/x86/ffi64.c	25 Dec 2004 09:54:40 -0000	1.8
+++ src/x86/ffi64.c	5 May 2005 04:01:54 -0000
@@ -42,7 +42,7 @@ struct register_args
 };
 
 extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
-			     void *raddr, void (*fnaddr)());
+			     void *raddr, void (*fnaddr)(), unsigned ssecount);
 
 /* All reference to register classes here is identical to the code in
    gcc/config/i386/i386.c. Do *not* change one without the other.  */
@@ -303,10 +303,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 	  else if (sse0 && sse1)
 	    flags |= 1 << 10;
 	  /* Mark the true size of the structure.  */
-	  flags |= cif->rtype->size << 11;
+	  flags |= cif->rtype->size << 12;
 	}
     }
-  cif->flags = flags;
 
   /* Go over all arguments and determine the way they should be passed.
      If it's in a register and there is space for it, let that be so. If
@@ -331,6 +330,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 	  ssecount += nsse;
 	}
     }
+  if (ssecount)
+    flags |= 1 << 11;
+  cif->flags = flags;
   cif->bytes = bytes;
 
   return FFI_OK;
@@ -353,7 +355,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), vo
      address then we need to make one.  Note the setting of flags to
      VOID above in ffi_prep_cif_machdep.  */
   ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
-		   && cif->flags == FFI_TYPE_VOID);
+		   && (cif->flags & 0xff) == FFI_TYPE_VOID);
   if (rvalue == NULL && ret_in_memory)
     rvalue = alloca (cif->rtype->size);
 
@@ -424,7 +426,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), vo
     }
 
   ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
-		   cif->flags, rvalue, fn);
+		   cif->flags, rvalue, fn, ssecount);
 }
 
 
@@ -439,13 +441,18 @@ ffi_prep_closure (ffi_closure* closure,
   volatile unsigned short *tramp;
 
   tramp = (volatile unsigned short *) &closure->tramp[0];
+
   tramp[0] = 0xbb49;		/* mov <code>, %r11	*/
-  tramp[5] = 0xba49;		/* mov <data>, %r10	*/
-  tramp[10] = 0xff49;		/* jmp *%r11	*/
-  tramp[11] = 0x00e3;
   *(void * volatile *) &tramp[1] = ffi_closure_unix64;
+  tramp[5] = 0xba49;		/* mov <data>, %r10	*/
   *(void * volatile *) &tramp[6] = closure;
 
+  /* Set the carry bit iff the function uses any sse registers.
+     This is clc or stc, together with the first byte of the jmp.  */
+  tramp[10] = cif->flags & (1 << 11) ? 0x49f9 : 0x49f8;
+
+  tramp[11] = 0xe3ff;			/* jmp *%r11    */
+
   closure->cif = cif;
   closure->fun = fun;
   closure->user_data = user_data;
Index: src/x86/unix64.S
===================================================================
RCS file: /cvs/gcc/gcc/libffi/src/x86/unix64.S,v
retrieving revision 1.7
diff -u -p -d -r1.7 unix64.S
--- src/x86/unix64.S	27 Dec 2004 09:20:10 -0000	1.7
+++ src/x86/unix64.S	5 May 2005 04:01:54 -0000
@@ -31,7 +31,7 @@
 .text
 
 /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
-                    void *raddr, void (*fnaddr)());
+	            void *raddr, void (*fnaddr)(), unsigned ssecount);
 
    Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
    for this function.  This has been allocated by ffi_call.  We also
@@ -39,7 +39,7 @@
 
 	.align	2
 	.globl	ffi_call_unix64
-        .type	ffi_call_unix64,@function
+	.type	ffi_call_unix64,@function
 
 ffi_call_unix64:
 .LUW0:
@@ -53,6 +53,7 @@ ffi_call_unix64:
 .LUW1:
 	movq	%rdi, %r10		/* Save a copy of the register area. */
 	movq	%r8, %r11		/* Save a copy of the target fn.  */
+	movl	%r9d, %eax		/* Set number of SSE registers.  */
 
 	/* Load up all argument registers.  */
 	movq	(%r10), %rdi
@@ -61,14 +62,9 @@ ffi_call_unix64:
 	movq	24(%r10), %rcx
 	movq	32(%r10), %r8
 	movq	40(%r10), %r9
-	movdqa	48(%r10), %xmm0
-	movdqa	64(%r10), %xmm1
-	movdqa	80(%r10), %xmm2
-	movdqa	96(%r10), %xmm3
-	movdqa	112(%r10), %xmm4
-	movdqa	128(%r10), %xmm5
-	movdqa	144(%r10), %xmm6
-	movdqa	160(%r10), %xmm7
+	testl	%eax, %eax
+	jnz	.Lload_sse
+.Lret_from_load_sse:
 
 	/* Deallocate the reg arg area.  */
 	leaq	176(%r10), %rsp
@@ -181,37 +177,49 @@ ffi_call_unix64:
 	movq	%rax, (%rsi)
 	movq	%rdx, 8(%rsi)
 
-	/* Bits 11-31 contain the true size of the structure.  Copy from
+	/* Bits 12-31 contain the true size of the structure.  Copy from
 	   the scratch area to the true destination.  */
-	shrl	$11, %ecx
+	shrl	$12, %ecx
 	rep movsb
 	ret
+
+	/* Many times we can avoid loading any SSE registers at all.
+	   It's not worth an indirect jump to load the exact set of
+	   SSE registers needed; zero or all is a good compromise.  */
+	.align 2
 .LUW3:
+.Lload_sse:
+	movdqa	48(%r10), %xmm0
+	movdqa	64(%r10), %xmm1
+	movdqa	80(%r10), %xmm2
+	movdqa	96(%r10), %xmm3
+	movdqa	112(%r10), %xmm4
+	movdqa	128(%r10), %xmm5
+	movdqa	144(%r10), %xmm6
+	movdqa	160(%r10), %xmm7
+	jmp	.Lret_from_load_sse
+
+.LUW4:
 	.size    ffi_call_unix64,.-ffi_call_unix64
 
 	.align	2
 	.globl ffi_closure_unix64
-        .type	ffi_closure_unix64,@function
+	.type	ffi_closure_unix64,@function
 
 ffi_closure_unix64:
-.LUW4:
-	subq	$200, %rsp
 .LUW5:
-
+	/* The carry flag is set by the trampoline iff SSE registers
+	   are used.  Don't clobber it before the branch instruction.  */
+	leaq    -200(%rsp), %rsp
+.LUW6:
 	movq	%rdi, (%rsp)
-        movq    %rsi, 8(%rsp)
-        movq    %rdx, 16(%rsp)
-        movq    %rcx, 24(%rsp)
-        movq    %r8, 32(%rsp)
-        movq    %r9, 40(%rsp)
-	movdqa	%xmm0, 48(%rsp)
-	movdqa	%xmm1, 64(%rsp)
-	movdqa	%xmm2, 80(%rsp)
-	movdqa	%xmm3, 96(%rsp)
-	movdqa	%xmm4, 112(%rsp)
-	movdqa	%xmm5, 128(%rsp)
-	movdqa	%xmm6, 144(%rsp)
-	movdqa	%xmm7, 160(%rsp)
+	movq    %rsi, 8(%rsp)
+	movq    %rdx, 16(%rsp)
+	movq    %rcx, 24(%rsp)
+	movq    %r8, 32(%rsp)
+	movq    %r9, 40(%rsp)
+	jc      .Lsave_sse
+.Lret_from_save_sse:
 
 	movq	%r10, %rdi
 	leaq	176(%rsp), %rsi
@@ -221,7 +229,7 @@ ffi_closure_unix64:
 
 	/* Deallocate stack frame early; return value is now in redzone.  */
 	addq	$200, %rsp
-.LUW6:
+.LUW7:
 
 	/* The first byte of the return value contains the FFI_TYPE.  */
 	movzbl	%al, %r10d
@@ -300,7 +308,22 @@ ffi_closure_unix64:
 	movq	-24(%rsp), %rax
 	cmovnz	%rdx, %rax
 	ret
-.LUW7:
+
+	/* See the comment above .Lload_sse; the same logic applies here.  */
+	.align 2
+.LUW8:
+.Lsave_sse:
+	movdqa	%xmm0, 48(%rsp)
+	movdqa	%xmm1, 64(%rsp)
+	movdqa	%xmm2, 80(%rsp)
+	movdqa	%xmm3, 96(%rsp)
+	movdqa	%xmm4, 112(%rsp)
+	movdqa	%xmm5, 128(%rsp)
+	movdqa	%xmm6, 144(%rsp)
+	movdqa	%xmm7, 160(%rsp)
+	jmp	.Lret_from_save_sse
+
+.LUW9:
 	.size	ffi_closure_unix64,.-ffi_closure_unix64
 
 	.section	.eh_frame,"a",@progbits
@@ -327,24 +350,25 @@ ffi_closure_unix64:
 .LASFDE1:
 	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
 	.long	.LUW0-.			/* FDE initial location */
-	.long	.LUW3-.LUW0		/* FDE address range */
+	.long	.LUW4-.LUW0		/* FDE address range */
 	.uleb128 0x0			/* Augmentation size */
 
 	.byte	0x4			/* DW_CFA_advance_loc4 */
 	.long	.LUW1-.LUW0
 
-        /* New stack frame based off rbp.  This is a itty bit of unwind
-           trickery in that the CFA *has* changed.  There is no easy way
-           to describe it correctly on entry to the function.  Fortunately,
-           it doesn't matter too much since at all points we can correctly
-           unwind back to ffi_call.  Note that the location to which we
-           moved the return address is (the new) CFA-8, so from the
-           perspective of the unwind info, it hasn't moved.  */
+	/* New stack frame based off rbp.  This is an itty bit of unwind
+	   trickery in that the CFA *has* changed.  There is no easy way
+	   to describe it correctly on entry to the function.  Fortunately,
+	   it doesn't matter too much since at all points we can correctly
+	   unwind back to ffi_call.  Note that the location to which we
+	   moved the return address is (the new) CFA-8, so from the
+	   perspective of the unwind info, it hasn't moved.  */
 	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
 	.uleb128 6
 	.uleb128 32
 	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
 	.uleb128 2
+	.byte	0xa			/* DW_CFA_remember_state */
 
 	.byte	0x4			/* DW_CFA_advance_loc4 */
 	.long	.LUW2-.LUW1
@@ -352,23 +376,36 @@ ffi_closure_unix64:
 	.uleb128 7
 	.uleb128 8
 	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
+
+	.byte	0x4			/* DW_CFA_advance_loc4 */
+	.long	.LUW3-.LUW2
+	.byte	0xb			/* DW_CFA_restore_state */
+
 	.align 8
 .LEFDE1:
 .LSFDE3:
 	.long	.LEFDE3-.LASFDE3	/* FDE Length */
 .LASFDE3:
 	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
-	.long	.LUW4-.			/* FDE initial location */
-	.long	.LUW7-.LUW4		/* FDE address range */
+	.long	.LUW5-.			/* FDE initial location */
+	.long	.LUW9-.LUW5		/* FDE address range */
 	.uleb128 0x0			/* Augmentation size */
+
 	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW5-.LUW4
+	.long	.LUW6-.LUW5
 	.byte	0xe			/* DW_CFA_def_cfa_offset */
 	.uleb128 208
+	.byte	0xa			/* DW_CFA_remember_state */
+
 	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW6-.LUW5
+	.long	.LUW7-.LUW6
 	.byte	0xe			/* DW_CFA_def_cfa_offset */
 	.uleb128 8
+
+	.byte	0x4			/* DW_CFA_advance_loc4 */
+	.long	.LUW8-.LUW7
+	.byte	0xb			/* DW_CFA_restore_state */
+
 	.align 8
 .LEFDE3:
 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]