This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Xtensa] avoid clobbering return address register


Stack unwinding for Xtensa requires that the return address register (a0) always be valid. There were a few places where this register was temporarily used for something else -- the fix_return_addr pattern, the trampoline template, and the software-multiply helper routines all used CALL0 internally -- and that was fine for normal exception handling. For forced unwinding from asynchronous thread termination in Linux, however, we need to avoid touching the return address. I've committed this patch to take care of that. I've tested it with a number of different Xtensa processor configurations, since several of the changes are specific to particular processor features.

2007-12-20 Bob Wilson <bob.wilson@acm.org>

	* config/xtensa/xtensa.md (fix_return_addr): Remove.
	* config/xtensa/xtensa-protos.h (xtensa_initialize_trampoline): New.
	(xtensa_trampoline_template): New.
	* config/xtensa/xtensa.c (MIN_FRAME_SIZE): Moved here from xtensa.h.
	(xtensa_return_addr): Expand to standard Xtensa insns instead of
	fix_return_addr.  Get high bits from a local label.
	(xtensa_trampoline_template): New function with code moved from
	TRAMPOLINE_TEMPLATE in xtensa.h.  Use L32R instead of CALL0 except
	when using CONST16 or absolute-mode literals.
	(xtensa_initialize_trampoline): New function with code moved from
	INITIALIZE_TRAMPOLINE in xtensa.h.  Use different offsets depending
	on which trampoline version is used.
	* config/xtensa/lib2funcs.S (TRAMPOLINE_SIZE): Add comment.
	* config/xtensa/xtensa.h (TARGET_ABSOLUTE_LITERALS): Define.
	(MIN_FRAME_SIZE): Moved to xtensa.c.
	(TRAMPOLINE_TEMPLATE): Use xtensa_trampoline_template.
	(TRAMPOLINE_SIZE): Two versions of the trampoline have different sizes.
	(INITIALIZE_TRAMPOLINE): Use xtensa_initialize_trampoline.
	* config/xtensa/ieee754-df.S (XCHAL_NO_MUL): Define.
	(__muldf3): Use CALL12 instead of CALL0 to invoke .Lmul_mulsi3
	helper when not using the CALL0 ABI.  Change .Lmul_mulsi3 to match.
	* config/xtensa/lib1funcs.asm (__umulsidi3): Likewise.
	* config/xtensa/ieee754-sf.S (__mulsf3): Likewise.
Index: config/xtensa/xtensa.c
===================================================================
--- config/xtensa/xtensa.c	(revision 131106)
+++ config/xtensa/xtensa.c	(working copy)
@@ -2301,6 +2301,10 @@
 }
 
 
+/* Minimum frame = reg save area (4 words) plus static chain (1 word),
+   with the total size rounded up to a multiple of 128 bits.  */
+#define MIN_FRAME_SIZE (8 * UNITS_PER_WORD)
+
 void
 xtensa_expand_prologue (void)
 {
@@ -2379,7 +2383,7 @@
 rtx
 xtensa_return_addr (int count, rtx frame)
 {
-  rtx result, retaddr;
+  rtx result, retaddr, curaddr, label;
 
   if (count == -1)
     retaddr = gen_rtx_REG (Pmode, A0_REG);
@@ -2393,10 +2397,25 @@
 
   /* The 2 most-significant bits of the return address on Xtensa hold
      the register window size.  To get the real return address, these
-     bits must be replaced with the high bits from the current PC.  */
+     bits must be replaced with the high bits from some address in the
+     code.  */
+
+  /* Get the 2 high bits of a local label in the code.  */
+  curaddr = gen_reg_rtx (Pmode);
+  label = gen_label_rtx ();
+  emit_label (label);
+  LABEL_PRESERVE_P (label) = 1;
+  emit_move_insn (curaddr, gen_rtx_LABEL_REF (Pmode, label));
+  emit_insn (gen_lshrsi3 (curaddr, curaddr, GEN_INT (30)));
+  emit_insn (gen_ashlsi3 (curaddr, curaddr, GEN_INT (30)));
 
+  /* Clear the 2 high bits of the return address.  */
   result = gen_reg_rtx (Pmode);
-  emit_insn (gen_fix_return_addr (result, retaddr));
+  emit_insn (gen_ashlsi3 (result, retaddr, GEN_INT (2)));
+  emit_insn (gen_lshrsi3 (result, result, GEN_INT (2)));
+
+  /* Combine them to get the result.  */
+  emit_insn (gen_iorsi3 (result, result, curaddr));
   return result;
 }
 
@@ -3126,4 +3145,95 @@
 	  > 4 * UNITS_PER_WORD);
 }
 
+
+/* TRAMPOLINE_TEMPLATE: For Xtensa, the trampoline must perform an ENTRY
+   instruction with a minimal stack frame in order to get some free
+   registers.  Once the actual call target is known, the proper stack frame
+   size is extracted from the ENTRY instruction at the target and the
+   current frame is adjusted to match.  The trampoline then transfers
+   control to the instruction following the ENTRY at the target.  Note:
+   this assumes that the target begins with an ENTRY instruction.  */
+
+void
+xtensa_trampoline_template (FILE *stream)
+{
+  bool use_call0 = (TARGET_CONST16 || TARGET_ABSOLUTE_LITERALS);
+
+  fprintf (stream, "\t.begin no-transform\n");
+  fprintf (stream, "\tentry\tsp, %d\n", MIN_FRAME_SIZE);
+
+  if (use_call0)
+    {
+      /* Save the return address.  */
+      fprintf (stream, "\tmov\ta10, a0\n");
+
+      /* Use a CALL0 instruction to skip past the constants and in the
+	 process get the PC into A0.  This allows PC-relative access to
+	 the constants without relying on L32R.  */
+      fprintf (stream, "\tcall0\t.Lskipconsts\n");
+    }
+  else
+    fprintf (stream, "\tj\t.Lskipconsts\n");
+
+  fprintf (stream, "\t.align\t4\n");
+  fprintf (stream, ".Lchainval:%s0\n", integer_asm_op (4, TRUE));
+  fprintf (stream, ".Lfnaddr:%s0\n", integer_asm_op (4, TRUE));
+  fprintf (stream, ".Lskipconsts:\n");
+
+  /* Load the static chain and function address from the trampoline.  */
+  if (use_call0)
+    {
+      fprintf (stream, "\taddi\ta0, a0, 3\n");
+      fprintf (stream, "\tl32i\ta9, a0, 0\n");
+      fprintf (stream, "\tl32i\ta8, a0, 4\n");
+    }
+  else
+    {
+      fprintf (stream, "\tl32r\ta9, .Lchainval\n");
+      fprintf (stream, "\tl32r\ta8, .Lfnaddr\n");
+    }
+
+  /* Store the static chain.  */
+  fprintf (stream, "\ts32i\ta9, sp, %d\n", MIN_FRAME_SIZE - 20);
+
+  /* Set the proper stack pointer value.  */
+  fprintf (stream, "\tl32i\ta9, a8, 0\n");
+  fprintf (stream, "\textui\ta9, a9, %d, 12\n",
+	   TARGET_BIG_ENDIAN ? 8 : 12);
+  fprintf (stream, "\tslli\ta9, a9, 3\n");
+  fprintf (stream, "\taddi\ta9, a9, %d\n", -MIN_FRAME_SIZE);
+  fprintf (stream, "\tsub\ta9, sp, a9\n");
+  fprintf (stream, "\tmovsp\tsp, a9\n");
+
+  if (use_call0)
+    /* Restore the return address.  */
+    fprintf (stream, "\tmov\ta0, a10\n");
+
+  /* Jump to the instruction following the ENTRY.  */
+  fprintf (stream, "\taddi\ta8, a8, 3\n");
+  fprintf (stream, "\tjx\ta8\n");
+
+  /* Pad size to a multiple of TRAMPOLINE_ALIGNMENT.  */
+  if (use_call0)
+    fprintf (stream, "\t.byte\t0\n");
+  else
+    fprintf (stream, "\tnop\n");
+
+  fprintf (stream, "\t.end no-transform\n");
+}
+
+
+void
+xtensa_initialize_trampoline (rtx addr, rtx func, rtx chain)
+{
+  bool use_call0 = (TARGET_CONST16 || TARGET_ABSOLUTE_LITERALS);
+  int chain_off = use_call0 ? 12 : 8;
+  int func_off = use_call0 ? 16 : 12;
+  emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, chain_off)), chain);
+  emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, func_off)), func);
+  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__xtensa_sync_caches"),
+		     0, VOIDmode, 1, addr, Pmode);
+}
+
+
 #include "gt-xtensa.h"
Index: config/xtensa/lib2funcs.S
===================================================================
--- config/xtensa/lib2funcs.S	(revision 131106)
+++ config/xtensa/lib2funcs.S	(working copy)
@@ -1,5 +1,5 @@
 /* Assembly functions for libgcc2.
-   Copyright (C) 2001, 2006 Free Software Foundation, Inc.
+   Copyright (C) 2001, 2006, 2007 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
 
 This file is part of GCC.
@@ -151,6 +151,7 @@
    make sure that the modified instructions are loaded into the instruction
    fetch buffer.  */
 
+/* Use the maximum trampoline size.  Flushing a bit extra is OK.  */
 #define TRAMPOLINE_SIZE 60
 
 	.text
Index: config/xtensa/xtensa.h
===================================================================
--- config/xtensa/xtensa.h	(revision 131106)
+++ config/xtensa/xtensa.h	(working copy)
@@ -72,6 +72,7 @@
 #define TARGET_ADDX		XCHAL_HAVE_ADDX
 #define TARGET_RELEASE_SYNC	XCHAL_HAVE_RELEASE_SYNC
 #define TARGET_S32C1I		XCHAL_HAVE_S32C1I
+#define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
 
 #define TARGET_DEFAULT (						\
   (XCHAL_HAVE_L32R	? 0 : MASK_CONST16))
@@ -704,83 +705,19 @@
 /* Stack pointer value doesn't matter at exit.  */
 #define EXIT_IGNORE_STACK 1
 
-/* A C statement to output, on the stream FILE, assembler code for a
-   block of data that contains the constant parts of a trampoline. 
-   This code should not include a label--the label is taken care of
-   automatically.
-
-   For Xtensa, the trampoline must perform an entry instruction with a
-   minimal stack frame in order to get some free registers.  Once the
-   actual call target is known, the proper stack frame size is extracted
-   from the entry instruction at the target and the current frame is
-   adjusted to match.  The trampoline then transfers control to the
-   instruction following the entry at the target.  Note: this assumes
-   that the target begins with an entry instruction.  */
-
-/* minimum frame = reg save area (4 words) plus static chain (1 word)
-   and the total number of words must be a multiple of 128 bits */
-#define MIN_FRAME_SIZE (8 * UNITS_PER_WORD)
-
-#define TRAMPOLINE_TEMPLATE(STREAM)					\
-  do {									\
-    fprintf (STREAM, "\t.begin no-transform\n");			\
-    fprintf (STREAM, "\tentry\tsp, %d\n", MIN_FRAME_SIZE);		\
-									\
-    /* save the return address */					\
-    fprintf (STREAM, "\tmov\ta10, a0\n");				\
-									\
-    /* Use a CALL0 instruction to skip past the constants and in the	\
-       process get the PC into A0.  This allows PC-relative access to	\
-       the constants without relying on L32R, which may not always be	\
-       available.  */							\
-									\
-    fprintf (STREAM, "\tcall0\t.Lskipconsts\n");			\
-    fprintf (STREAM, "\t.align\t4\n");					\
-    fprintf (STREAM, ".Lchainval:%s0\n", integer_asm_op (4, TRUE));	\
-    fprintf (STREAM, ".Lfnaddr:%s0\n", integer_asm_op (4, TRUE));	\
-    fprintf (STREAM, ".Lskipconsts:\n");				\
-									\
-    /* store the static chain */					\
-    fprintf (STREAM, "\taddi\ta0, a0, 3\n");				\
-    fprintf (STREAM, "\tl32i\ta8, a0, 0\n");				\
-    fprintf (STREAM, "\ts32i\ta8, sp, %d\n", MIN_FRAME_SIZE - 20);	\
-									\
-    /* set the proper stack pointer value */				\
-    fprintf (STREAM, "\tl32i\ta8, a0, 4\n");				\
-    fprintf (STREAM, "\tl32i\ta9, a8, 0\n");				\
-    fprintf (STREAM, "\textui\ta9, a9, %d, 12\n",			\
-	     TARGET_BIG_ENDIAN ? 8 : 12);				\
-    fprintf (STREAM, "\tslli\ta9, a9, 3\n");				\
-    fprintf (STREAM, "\taddi\ta9, a9, %d\n", -MIN_FRAME_SIZE);		\
-    fprintf (STREAM, "\tsub\ta9, sp, a9\n");				\
-    fprintf (STREAM, "\tmovsp\tsp, a9\n");				\
-									\
-    /* restore the return address */					\
-    fprintf (STREAM, "\tmov\ta0, a10\n");				\
-									\
-    /* jump to the instruction following the entry */			\
-    fprintf (STREAM, "\taddi\ta8, a8, 3\n");				\
-    fprintf (STREAM, "\tjx\ta8\n");					\
-    fprintf (STREAM, "\t.byte\t0\n");					\
-    fprintf (STREAM, "\t.end no-transform\n");				\
-  } while (0)
+#define TRAMPOLINE_TEMPLATE(STREAM) xtensa_trampoline_template (STREAM)
 
 /* Size in bytes of the trampoline, as an integer.  Make sure this is
    a multiple of TRAMPOLINE_ALIGNMENT to avoid -Wpadded warnings.  */
-#define TRAMPOLINE_SIZE 60
+#define TRAMPOLINE_SIZE (TARGET_CONST16 || TARGET_ABSOLUTE_LITERALS ? 60 : 52)
 
 /* Alignment required for trampolines, in bits.  */
-#define TRAMPOLINE_ALIGNMENT (32)
+#define TRAMPOLINE_ALIGNMENT 32
 
 /* A C statement to initialize the variable parts of a trampoline.  */
 #define INITIALIZE_TRAMPOLINE(ADDR, FUNC, CHAIN)			\
-  do {									\
-    rtx addr = ADDR;							\
-    emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, 12)), CHAIN); \
-    emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, 16)), FUNC); \
-    emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__xtensa_sync_caches"), \
-		       0, VOIDmode, 1, addr, Pmode);			\
-  } while (0)
+  xtensa_initialize_trampoline (ADDR, FUNC, CHAIN)
+
 
 /* If defined, a C expression that produces the machine-specific code
    to setup the stack so that arbitrary frames can be accessed.
Index: config/xtensa/ieee754-df.S
===================================================================
--- config/xtensa/ieee754-df.S	(revision 131106)
+++ config/xtensa/ieee754-df.S	(working copy)
@@ -1,5 +1,5 @@
 /* IEEE-754 double-precision functions for Xtensa
-   Copyright (C) 2006 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2007 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
 
    This file is part of GCC.
@@ -607,6 +607,10 @@
 #ifdef L_muldf3
 
 	/* Multiplication */
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
 __muldf3_aux:
 
 	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
@@ -728,13 +732,19 @@
 	.global	__muldf3
 	.type	__muldf3, @function
 __muldf3:
-	leaf_entry sp, 32
 #if __XTENSA_CALL0_ABI__
+	leaf_entry sp, 32
 	addi	sp, sp, -32
 	s32i	a12, sp, 16
 	s32i	a13, sp, 20
 	s32i	a14, sp, 24
 	s32i	a15, sp, 28
+#elif XCHAL_NO_MUL
+	/* This is not really a leaf function; allocate enough stack space
+	   to allow CALL12s to a helper function.  */
+	leaf_entry sp, 64
+#else
+	leaf_entry sp, 32
 #endif
 	movi	a6, 0x7ff00000
 
@@ -809,7 +819,7 @@
 	muluh	xh, xh, yh
 	add	xh, xh, a9
 
-#else
+#else /* ! XCHAL_HAVE_MUL32_HIGH */
 
 	/* Break the inputs into 16-bit chunks and compute 16 32-bit partial
 	   products.  These partial products are:
@@ -847,7 +857,7 @@
 
 	/* Save a7 since it is needed to hold a temporary value.  */
 	s32i	a7, sp, 4
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	/* Calling a separate multiply function will clobber a0 and requires
 	   use of a8 as a temporary, so save those values now.  (The function
 	   uses a custom ABI so nothing else needs to be saved.)  */
@@ -915,12 +925,21 @@
 #define set_arg_h(dst, src) \
 	srli	dst, src, 16
 
+#if __XTENSA_CALL0_ABI__
 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
 	set_arg_ ## xhalf (a13, xreg); \
 	set_arg_ ## yhalf (a14, yreg); \
 	call0	.Lmul_mulsi3; \
 	mov	dst, a12
-#endif
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a14, xreg); \
+	set_arg_ ## yhalf (a15, yreg); \
+	call12	.Lmul_mulsi3; \
+	mov	dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
 
 	/* Add pp1 and pp2 into a10 with carry-out in a9.  */
 	do_mul(a10, xl, l, yl, h)	/* pp 1 */
@@ -1032,11 +1051,11 @@
 
 	/* Restore values saved on the stack during the multiplication.  */
 	l32i	a7, sp, 4
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	l32i	a0, sp, 0
 	l32i	a8, sp, 8
 #endif
-#endif
+#endif /* ! XCHAL_HAVE_MUL32_HIGH */
 
 	/* Shift left by 12 bits, unless there was a carry-out from the
 	   multiply, in which case, shift by 11 bits and increment the
@@ -1157,38 +1176,47 @@
 	movi	xl, 0
 	j	.Lmul_done
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if XCHAL_NO_MUL
 	
 	/* For Xtensa processors with no multiply hardware, this simplified
 	   version of _mulsi3 is used for multiplying 16-bit chunks of
-	   the floating-point mantissas.  It uses a custom ABI:	the inputs
-	   are passed in a13 and a14, the result is returned in a12, and
-	   a8 and a15 are clobbered.  */
+	   the floating-point mantissas.  When using CALL0, this function
+	   uses a custom ABI: the inputs are passed in a13 and a14, the
+	   result is returned in a12, and a8 and a15 are clobbered.  */
 	.align	4
 .Lmul_mulsi3:
-	movi	a12, 0
-.Lmul_mult_loop:
-	add	a15, a14, a12
-	extui	a8, a13, 0, 1
-	movnez	a12, a15, a8
-
-	do_addx2 a15, a14, a12, a15
-	extui	a8, a13, 1, 1
-	movnez	a12, a15, a8
-
-	do_addx4 a15, a14, a12, a15
-	extui	a8, a13, 2, 1
-	movnez	a12, a15, a8
-
-	do_addx8 a15, a14, a12, a15
-	extui	a8, a13, 3, 1
-	movnez	a12, a15, a8
-
-	srli	a13, a13, 4
-	slli	a14, a14, 4
-	bnez	a13, .Lmul_mult_loop
-	ret
-#endif /* !MUL16 && !MUL32 && !MAC16 */
+	leaf_entry sp, 16
+	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+	movi	\dst, 0
+1:	add	\tmp1, \src2, \dst
+	extui	\tmp2, \src1, 0, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx2 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 1, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx4 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 2, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx8 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 3, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	srli	\src1, \src1, 4
+	slli	\src2, \src2, 4
+	bnez	\src1, 1b
+	.endm
+#if __XTENSA_CALL0_ABI__
+	mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+	/* The result will be written into a2, so save that argument in a4.  */
+	mov	a4, a2
+	mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+	leaf_return
+#endif /* XCHAL_NO_MUL */
 #endif /* L_muldf3 */
 
 #ifdef L_divdf3
Index: config/xtensa/xtensa.md
===================================================================
--- config/xtensa/xtensa.md	(revision 131106)
+++ config/xtensa/xtensa.md	(working copy)
@@ -1666,21 +1666,6 @@
    (set_attr "mode"	"none")
    (set_attr "length"	"0")])
 
-;; The fix_return_addr pattern sets the high 2 bits of an address in a
-;; register to match the high bits of the current PC.
-(define_insn "fix_return_addr"
-  [(set (match_operand:SI 0 "register_operand" "=a")
-	(unspec:SI [(match_operand:SI 1 "register_operand" "r")]
-		   UNSPEC_RET_ADDR))
-   (clobber (match_scratch:SI 2 "=r"))
-   (clobber (match_scratch:SI 3 "=r"))]
-  ""
-  "mov\t%2, a0\;call0\t0f\;.align\t4\;0:\;mov\t%3, a0\;mov\ta0, %2\;\
-srli\t%3, %3, 30\;slli\t%0, %1, 2\;ssai\t2\;src\t%0, %3, %0"
-  [(set_attr "type"	"multi")
-   (set_attr "mode"	"SI")
-   (set_attr "length"	"24")])
-
 
 ;; Instructions for the Xtensa "boolean" option.
 
Index: config/xtensa/lib1funcs.asm
===================================================================
--- config/xtensa/lib1funcs.asm	(revision 131106)
+++ config/xtensa/lib1funcs.asm	(working copy)
@@ -201,17 +201,28 @@
 
 
 #ifdef L_umulsidi3
+
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
 	.align	4
 	.global	__umulsidi3
 	.type	__umulsidi3, @function
 __umulsidi3:
-	leaf_entry sp, 32
 #if __XTENSA_CALL0_ABI__
+	leaf_entry sp, 32
 	addi	sp, sp, -32
 	s32i	a12, sp, 16
 	s32i	a13, sp, 20
 	s32i	a14, sp, 24
 	s32i	a15, sp, 28
+#elif XCHAL_NO_MUL
+	/* This is not really a leaf function; allocate enough stack space
+	   to allow CALL12s to a helper function.  */
+	leaf_entry sp, 48
+#else
+	leaf_entry sp, 16
 #endif
 
 #ifdef __XTENSA_EB__
@@ -232,7 +243,7 @@
 
 #else /* ! MUL32_HIGH */
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	/* a0 and a8 will be clobbered by calling the multiply function
 	   but a8 is not used here and need not be saved.  */
 	s32i	a0, sp, 0
@@ -290,12 +301,21 @@
 #define set_arg_h(dst, src) \
 	srli	dst, src, 16
 
+#if __XTENSA_CALL0_ABI__
 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
 	set_arg_ ## xhalf (a13, xreg); \
 	set_arg_ ## yhalf (a14, yreg); \
 	call0	.Lmul_mulsi3; \
 	mov	dst, a12
-#endif
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a14, xreg); \
+	set_arg_ ## yhalf (a15, yreg); \
+	call12	.Lmul_mulsi3; \
+	mov	dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
 
 	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
 	do_mul(a6, a2, l, a3, h)	/* pp 1 */
@@ -324,7 +344,7 @@
 
 #endif /* !MUL32_HIGH */
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	/* Restore the original return address.  */
 	l32i	a0, sp, 0
 #endif
@@ -337,38 +357,47 @@
 #endif
 	leaf_return
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if XCHAL_NO_MUL
 
 	/* For Xtensa processors with no multiply hardware, this simplified
 	   version of _mulsi3 is used for multiplying 16-bit chunks of
-	   the floating-point mantissas.  It uses a custom ABI:	the inputs
-	   are passed in a13 and a14, the result is returned in a12, and
-	   a8 and a15 are clobbered.  */
+	   the floating-point mantissas.  When using CALL0, this function
+	   uses a custom ABI: the inputs are passed in a13 and a14, the
+	   result is returned in a12, and a8 and a15 are clobbered.  */
 	.align	4
 .Lmul_mulsi3:
-	movi	a12, 0
-.Lmul_mult_loop:
-	add	a15, a14, a12
-	extui	a8, a13, 0, 1
-	movnez	a12, a15, a8
-
-	do_addx2 a15, a14, a12, a15
-	extui	a8, a13, 1, 1
-	movnez	a12, a15, a8
-
-	do_addx4 a15, a14, a12, a15
-	extui	a8, a13, 2, 1
-	movnez	a12, a15, a8
-
-	do_addx8 a15, a14, a12, a15
-	extui	a8, a13, 3, 1
-	movnez	a12, a15, a8
-
-	srli	a13, a13, 4
-	slli	a14, a14, 4
-	bnez	a13, .Lmul_mult_loop
-	ret
-#endif /* !MUL16 && !MUL32 && !MAC16 */
+	leaf_entry sp, 16
+	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+	movi	\dst, 0
+1:	add	\tmp1, \src2, \dst
+	extui	\tmp2, \src1, 0, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx2 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 1, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx4 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 2, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx8 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 3, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	srli	\src1, \src1, 4
+	slli	\src2, \src2, 4
+	bnez	\src1, 1b
+	.endm
+#if __XTENSA_CALL0_ABI__
+	mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+	/* The result will be written into a2, so save that argument in a4.  */
+	mov	a4, a2
+	mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+	leaf_return
+#endif /* XCHAL_NO_MUL */
 
 	.size	__umulsidi3, . - __umulsidi3
 
Index: config/xtensa/xtensa-protos.h
===================================================================
--- config/xtensa/xtensa-protos.h	(revision 131106)
+++ config/xtensa/xtensa-protos.h	(working copy)
@@ -69,6 +69,7 @@
 extern enum reg_class xtensa_secondary_reload_class (enum reg_class,
 						     enum machine_mode, rtx,
 						     int);
+extern void xtensa_initialize_trampoline (rtx, rtx, rtx);
 #endif /* RTX_CODE */
 
 #ifdef TREE_CODE
@@ -85,5 +86,6 @@
 extern int xtensa_frame_pointer_required (void);
 extern void xtensa_expand_prologue (void);
 extern void order_regs_for_local_alloc (void);
+extern void xtensa_trampoline_template (FILE *);
 
 #endif /* !__XTENSA_PROTOS_H__ */
Index: config/xtensa/ieee754-sf.S
===================================================================
--- config/xtensa/ieee754-sf.S	(revision 131106)
+++ config/xtensa/ieee754-sf.S	(working copy)
@@ -1,5 +1,5 @@
 /* IEEE-754 single-precision functions for Xtensa
-   Copyright (C) 2006 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2007 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
 
    This file is part of GCC.
@@ -488,6 +488,10 @@
 #ifdef L_mulsf3
 
 	/* Multiplication */
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
 __mulsf3_aux:
 
 	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
@@ -570,13 +574,19 @@
 	.global	__mulsf3
 	.type	__mulsf3, @function
 __mulsf3:
-	leaf_entry sp, 32
 #if __XTENSA_CALL0_ABI__
+	leaf_entry sp, 32
 	addi	sp, sp, -32
 	s32i	a12, sp, 16
 	s32i	a13, sp, 20
 	s32i	a14, sp, 24
 	s32i	a15, sp, 28
+#elif XCHAL_NO_MUL
+	/* This is not really a leaf function; allocate enough stack space
+	   to allow CALL12s to a helper function.  */
+	leaf_entry sp, 64
+#else
+	leaf_entry sp, 32
 #endif
 	movi	a6, 0x7f800000
 
@@ -633,7 +643,7 @@
 	   chunks can be extracted when setting up the arguments to the
 	   separate multiply function.  */
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	/* Calling a separate multiply function will clobber a0 and requires
 	   use of a8 as a temporary, so save those values now.  (The function
 	   uses a custom ABI so nothing else needs to be saved.)  */
@@ -693,12 +703,21 @@
 #define set_arg_h(dst, src) \
 	srli	dst, src, 16
 
+#if __XTENSA_CALL0_ABI__
 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
 	set_arg_ ## xhalf (a13, xreg); \
 	set_arg_ ## yhalf (a14, yreg); \
 	call0	.Lmul_mulsi3; \
 	mov	dst, a12
-#endif
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a14, xreg); \
+	set_arg_ ## yhalf (a15, yreg); \
+	call12	.Lmul_mulsi3; \
+	mov	dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
 
 	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
 	do_mul(a6, a2, l, a3, h)	/* pp 1 */
@@ -724,12 +743,12 @@
 	do_mul(a2, a2, h, a3, h)	/* pp 3 */
 	add	a2, a2, a9
 	
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	/* Restore values saved on the stack during the multiplication.  */
 	l32i	a0, sp, 0
 	l32i	a8, sp, 4
 #endif
-#endif
+#endif /* ! XCHAL_HAVE_MUL32_HIGH */
 
 	/* Shift left by 9 bits, unless there was a carry-out from the
 	   multiply, in which case, shift by 8 bits and increment the
@@ -825,38 +844,47 @@
 	slli	a2, a2, 31
 	j	.Lmul_done
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if XCHAL_NO_MUL
 	
 	/* For Xtensa processors with no multiply hardware, this simplified
 	   version of _mulsi3 is used for multiplying 16-bit chunks of
-	   the floating-point mantissas.  It uses a custom ABI:	the inputs
-	   are passed in a13 and a14, the result is returned in a12, and
-	   a8 and a15 are clobbered.  */
+	   the floating-point mantissas.  When using CALL0, this function
+	   uses a custom ABI: the inputs are passed in a13 and a14, the
+	   result is returned in a12, and a8 and a15 are clobbered.  */
 	.align	4
 .Lmul_mulsi3:
-	movi	a12, 0
-.Lmul_mult_loop:
-	add	a15, a14, a12
-	extui	a8, a13, 0, 1
-	movnez	a12, a15, a8
-
-	do_addx2 a15, a14, a12, a15
-	extui	a8, a13, 1, 1
-	movnez	a12, a15, a8
-
-	do_addx4 a15, a14, a12, a15
-	extui	a8, a13, 2, 1
-	movnez	a12, a15, a8
-
-	do_addx8 a15, a14, a12, a15
-	extui	a8, a13, 3, 1
-	movnez	a12, a15, a8
-
-	srli	a13, a13, 4
-	slli	a14, a14, 4
-	bnez	a13, .Lmul_mult_loop
-	ret
-#endif /* !MUL16 && !MUL32 && !MAC16 */
+	leaf_entry sp, 16
+	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+	movi	\dst, 0
+1:	add	\tmp1, \src2, \dst
+	extui	\tmp2, \src1, 0, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx2 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 1, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx4 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 2, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx8 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 3, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	srli	\src1, \src1, 4
+	slli	\src2, \src2, 4
+	bnez	\src1, 1b
+	.endm
+#if __XTENSA_CALL0_ABI__
+	mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+	/* The result will be written into a2, so save that argument in a4.  */
+	mov	a4, a2
+	mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+	leaf_return
+#endif /* XCHAL_NO_MUL */
 #endif /* L_mulsf3 */
 
 #ifdef L_divsf3

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]