This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[Xtensa] avoid clobbering return address register
- From: Bob Wilson <bwilson at tensilica dot com>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Date: Thu, 20 Dec 2007 14:40:18 -0800
- Subject: [Xtensa] avoid clobbering return address register
Stack unwinding for Xtensa requires that the return address register always be
valid. There were a few places where this register was temporarily used for
something else, and that was fine for normal exception handling. For forced
unwinding from asynchronous thread termination in Linux, however, we need to
avoid touching the return address. I've committed this patch to take care of
that. I've tested it with a number of different Xtensa processor
configurations, since several of the changes are specific to particular
processor features.
2007-12-20 Bob Wilson <bob.wilson@acm.org>
* config/xtensa/xtensa.md (fix_return_addr): Remove.
* config/xtensa/xtensa-protos.h (xtensa_initialize_trampoline): New.
(xtensa_trampoline_template): New.
* config/xtensa/xtensa.c (MIN_FRAME_SIZE): Moved here from xtensa.h.
(xtensa_return_addr): Expand to standard Xtensa insns instead of
fix_return_addr. Get high bits from a local label.
(xtensa_trampoline_template): New function with code moved from
TRAMPOLINE_TEMPLATE in xtensa.h. Use L32R instead of CALL0 except
when using CONST16 or absolute-mode literals.
(xtensa_initialize_trampoline): New function with code moved from
INITIALIZE_TRAMPOLINE in xtensa.h. Use different offsets depending
on which trampoline version is used.
* config/xtensa/lib2funcs.S (TRAMPOLINE_SIZE): Add comment.
* config/xtensa/xtensa.h (TARGET_ABSOLUTE_LITERALS): Define.
(MIN_FRAME_SIZE): Moved to xtensa.c.
(TRAMPOLINE_TEMPLATE): Use xtensa_trampoline_template.
(TRAMPOLINE_SIZE): Two versions of the trampoline have different sizes.
(INITIALIZE_TRAMPOLINE): Use xtensa_initialize_trampoline.
* config/xtensa/ieee754-df.S (XCHAL_NO_MUL): Define.
(__muldf3): Use CALL12 instead of CALL0 to invoke .Lmul_mulsi3
helper when not using the CALL0 ABI. Change .Lmul_mulsi3 to match.
* config/xtensa/lib1funcs.asm (__umulsidi3): Likewise.
* config/xtensa/ieee754-sf.S (__mulsf3): Likewise.
Index: config/xtensa/xtensa.c
===================================================================
--- config/xtensa/xtensa.c (revision 131106)
+++ config/xtensa/xtensa.c (working copy)
@@ -2301,6 +2301,10 @@
}
+/* minimum frame = reg save area (4 words) plus static chain (1 word)
+ and the total number of words must be a multiple of 128 bits. */
+#define MIN_FRAME_SIZE (8 * UNITS_PER_WORD)
+
void
xtensa_expand_prologue (void)
{
@@ -2379,7 +2383,7 @@
rtx
xtensa_return_addr (int count, rtx frame)
{
- rtx result, retaddr;
+ rtx result, retaddr, curaddr, label;
if (count == -1)
retaddr = gen_rtx_REG (Pmode, A0_REG);
@@ -2393,10 +2397,25 @@
/* The 2 most-significant bits of the return address on Xtensa hold
the register window size. To get the real return address, these
- bits must be replaced with the high bits from the current PC. */
+ bits must be replaced with the high bits from some address in the
+ code. */
+
+ /* Get the 2 high bits of a local label in the code. */
+ curaddr = gen_reg_rtx (Pmode);
+ label = gen_label_rtx ();
+ emit_label (label);
+ LABEL_PRESERVE_P (label) = 1;
+ emit_move_insn (curaddr, gen_rtx_LABEL_REF (Pmode, label));
+ emit_insn (gen_lshrsi3 (curaddr, curaddr, GEN_INT (30)));
+ emit_insn (gen_ashlsi3 (curaddr, curaddr, GEN_INT (30)));
+ /* Clear the 2 high bits of the return address. */
result = gen_reg_rtx (Pmode);
- emit_insn (gen_fix_return_addr (result, retaddr));
+ emit_insn (gen_ashlsi3 (result, retaddr, GEN_INT (2)));
+ emit_insn (gen_lshrsi3 (result, result, GEN_INT (2)));
+
+ /* Combine them to get the result. */
+ emit_insn (gen_iorsi3 (result, result, curaddr));
return result;
}
@@ -3126,4 +3145,95 @@
> 4 * UNITS_PER_WORD);
}
+
+/* TRAMPOLINE_TEMPLATE: For Xtensa, the trampoline must perform an ENTRY
+ instruction with a minimal stack frame in order to get some free
+ registers. Once the actual call target is known, the proper stack frame
+ size is extracted from the ENTRY instruction at the target and the
+ current frame is adjusted to match. The trampoline then transfers
+ control to the instruction following the ENTRY at the target. Note:
+ this assumes that the target begins with an ENTRY instruction. */
+
+void
+xtensa_trampoline_template (FILE *stream)
+{
+ bool use_call0 = (TARGET_CONST16 || TARGET_ABSOLUTE_LITERALS);
+
+ fprintf (stream, "\t.begin no-transform\n");
+ fprintf (stream, "\tentry\tsp, %d\n", MIN_FRAME_SIZE);
+
+ if (use_call0)
+ {
+ /* Save the return address. */
+ fprintf (stream, "\tmov\ta10, a0\n");
+
+ /* Use a CALL0 instruction to skip past the constants and in the
+ process get the PC into A0. This allows PC-relative access to
+ the constants without relying on L32R. */
+ fprintf (stream, "\tcall0\t.Lskipconsts\n");
+ }
+ else
+ fprintf (stream, "\tj\t.Lskipconsts\n");
+
+ fprintf (stream, "\t.align\t4\n");
+ fprintf (stream, ".Lchainval:%s0\n", integer_asm_op (4, TRUE));
+ fprintf (stream, ".Lfnaddr:%s0\n", integer_asm_op (4, TRUE));
+ fprintf (stream, ".Lskipconsts:\n");
+
+ /* Load the static chain and function address from the trampoline. */
+ if (use_call0)
+ {
+ fprintf (stream, "\taddi\ta0, a0, 3\n");
+ fprintf (stream, "\tl32i\ta9, a0, 0\n");
+ fprintf (stream, "\tl32i\ta8, a0, 4\n");
+ }
+ else
+ {
+ fprintf (stream, "\tl32r\ta9, .Lchainval\n");
+ fprintf (stream, "\tl32r\ta8, .Lfnaddr\n");
+ }
+
+ /* Store the static chain. */
+ fprintf (stream, "\ts32i\ta9, sp, %d\n", MIN_FRAME_SIZE - 20);
+
+ /* Set the proper stack pointer value. */
+ fprintf (stream, "\tl32i\ta9, a8, 0\n");
+ fprintf (stream, "\textui\ta9, a9, %d, 12\n",
+ TARGET_BIG_ENDIAN ? 8 : 12);
+ fprintf (stream, "\tslli\ta9, a9, 3\n");
+ fprintf (stream, "\taddi\ta9, a9, %d\n", -MIN_FRAME_SIZE);
+ fprintf (stream, "\tsub\ta9, sp, a9\n");
+ fprintf (stream, "\tmovsp\tsp, a9\n");
+
+ if (use_call0)
+ /* Restore the return address. */
+ fprintf (stream, "\tmov\ta0, a10\n");
+
+ /* Jump to the instruction following the ENTRY. */
+ fprintf (stream, "\taddi\ta8, a8, 3\n");
+ fprintf (stream, "\tjx\ta8\n");
+
+ /* Pad size to a multiple of TRAMPOLINE_ALIGNMENT. */
+ if (use_call0)
+ fprintf (stream, "\t.byte\t0\n");
+ else
+ fprintf (stream, "\tnop\n");
+
+ fprintf (stream, "\t.end no-transform\n");
+}
+
+
+void
+xtensa_initialize_trampoline (rtx addr, rtx func, rtx chain)
+{
+ bool use_call0 = (TARGET_CONST16 || TARGET_ABSOLUTE_LITERALS);
+ int chain_off = use_call0 ? 12 : 8;
+ int func_off = use_call0 ? 16 : 12;
+ emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, chain_off)), chain);
+ emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, func_off)), func);
+ emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__xtensa_sync_caches"),
+ 0, VOIDmode, 1, addr, Pmode);
+}
+
+
#include "gt-xtensa.h"
Index: config/xtensa/lib2funcs.S
===================================================================
--- config/xtensa/lib2funcs.S (revision 131106)
+++ config/xtensa/lib2funcs.S (working copy)
@@ -1,5 +1,5 @@
/* Assembly functions for libgcc2.
- Copyright (C) 2001, 2006 Free Software Foundation, Inc.
+ Copyright (C) 2001, 2006, 2007 Free Software Foundation, Inc.
Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
This file is part of GCC.
@@ -151,6 +151,7 @@
make sure that the modified instructions are loaded into the instruction
fetch buffer. */
+/* Use the maximum trampoline size. Flushing a bit extra is OK. */
#define TRAMPOLINE_SIZE 60
.text
Index: config/xtensa/xtensa.h
===================================================================
--- config/xtensa/xtensa.h (revision 131106)
+++ config/xtensa/xtensa.h (working copy)
@@ -72,6 +72,7 @@
#define TARGET_ADDX XCHAL_HAVE_ADDX
#define TARGET_RELEASE_SYNC XCHAL_HAVE_RELEASE_SYNC
#define TARGET_S32C1I XCHAL_HAVE_S32C1I
+#define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
#define TARGET_DEFAULT ( \
(XCHAL_HAVE_L32R ? 0 : MASK_CONST16))
@@ -704,83 +705,19 @@
/* Stack pointer value doesn't matter at exit. */
#define EXIT_IGNORE_STACK 1
-/* A C statement to output, on the stream FILE, assembler code for a
- block of data that contains the constant parts of a trampoline.
- This code should not include a label--the label is taken care of
- automatically.
-
- For Xtensa, the trampoline must perform an entry instruction with a
- minimal stack frame in order to get some free registers. Once the
- actual call target is known, the proper stack frame size is extracted
- from the entry instruction at the target and the current frame is
- adjusted to match. The trampoline then transfers control to the
- instruction following the entry at the target. Note: this assumes
- that the target begins with an entry instruction. */
-
-/* minimum frame = reg save area (4 words) plus static chain (1 word)
- and the total number of words must be a multiple of 128 bits */
-#define MIN_FRAME_SIZE (8 * UNITS_PER_WORD)
-
-#define TRAMPOLINE_TEMPLATE(STREAM) \
- do { \
- fprintf (STREAM, "\t.begin no-transform\n"); \
- fprintf (STREAM, "\tentry\tsp, %d\n", MIN_FRAME_SIZE); \
- \
- /* save the return address */ \
- fprintf (STREAM, "\tmov\ta10, a0\n"); \
- \
- /* Use a CALL0 instruction to skip past the constants and in the \
- process get the PC into A0. This allows PC-relative access to \
- the constants without relying on L32R, which may not always be \
- available. */ \
- \
- fprintf (STREAM, "\tcall0\t.Lskipconsts\n"); \
- fprintf (STREAM, "\t.align\t4\n"); \
- fprintf (STREAM, ".Lchainval:%s0\n", integer_asm_op (4, TRUE)); \
- fprintf (STREAM, ".Lfnaddr:%s0\n", integer_asm_op (4, TRUE)); \
- fprintf (STREAM, ".Lskipconsts:\n"); \
- \
- /* store the static chain */ \
- fprintf (STREAM, "\taddi\ta0, a0, 3\n"); \
- fprintf (STREAM, "\tl32i\ta8, a0, 0\n"); \
- fprintf (STREAM, "\ts32i\ta8, sp, %d\n", MIN_FRAME_SIZE - 20); \
- \
- /* set the proper stack pointer value */ \
- fprintf (STREAM, "\tl32i\ta8, a0, 4\n"); \
- fprintf (STREAM, "\tl32i\ta9, a8, 0\n"); \
- fprintf (STREAM, "\textui\ta9, a9, %d, 12\n", \
- TARGET_BIG_ENDIAN ? 8 : 12); \
- fprintf (STREAM, "\tslli\ta9, a9, 3\n"); \
- fprintf (STREAM, "\taddi\ta9, a9, %d\n", -MIN_FRAME_SIZE); \
- fprintf (STREAM, "\tsub\ta9, sp, a9\n"); \
- fprintf (STREAM, "\tmovsp\tsp, a9\n"); \
- \
- /* restore the return address */ \
- fprintf (STREAM, "\tmov\ta0, a10\n"); \
- \
- /* jump to the instruction following the entry */ \
- fprintf (STREAM, "\taddi\ta8, a8, 3\n"); \
- fprintf (STREAM, "\tjx\ta8\n"); \
- fprintf (STREAM, "\t.byte\t0\n"); \
- fprintf (STREAM, "\t.end no-transform\n"); \
- } while (0)
+#define TRAMPOLINE_TEMPLATE(STREAM) xtensa_trampoline_template (STREAM)
/* Size in bytes of the trampoline, as an integer. Make sure this is
a multiple of TRAMPOLINE_ALIGNMENT to avoid -Wpadded warnings. */
-#define TRAMPOLINE_SIZE 60
+#define TRAMPOLINE_SIZE (TARGET_CONST16 || TARGET_ABSOLUTE_LITERALS ? 60 : 52)
/* Alignment required for trampolines, in bits. */
-#define TRAMPOLINE_ALIGNMENT (32)
+#define TRAMPOLINE_ALIGNMENT 32
/* A C statement to initialize the variable parts of a trampoline. */
#define INITIALIZE_TRAMPOLINE(ADDR, FUNC, CHAIN) \
- do { \
- rtx addr = ADDR; \
- emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, 12)), CHAIN); \
- emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, 16)), FUNC); \
- emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__xtensa_sync_caches"), \
- 0, VOIDmode, 1, addr, Pmode); \
- } while (0)
+ xtensa_initialize_trampoline (ADDR, FUNC, CHAIN)
+
/* If defined, a C expression that produces the machine-specific code
to setup the stack so that arbitrary frames can be accessed.
Index: config/xtensa/ieee754-df.S
===================================================================
--- config/xtensa/ieee754-df.S (revision 131106)
+++ config/xtensa/ieee754-df.S (working copy)
@@ -1,5 +1,5 @@
/* IEEE-754 double-precision functions for Xtensa
- Copyright (C) 2006 Free Software Foundation, Inc.
+ Copyright (C) 2006, 2007 Free Software Foundation, Inc.
Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
This file is part of GCC.
@@ -607,6 +607,10 @@
#ifdef L_muldf3
/* Multiplication */
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
__muldf3_aux:
/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
@@ -728,13 +732,19 @@
.global __muldf3
.type __muldf3, @function
__muldf3:
- leaf_entry sp, 32
#if __XTENSA_CALL0_ABI__
+ leaf_entry sp, 32
addi sp, sp, -32
s32i a12, sp, 16
s32i a13, sp, 20
s32i a14, sp, 24
s32i a15, sp, 28
+#elif XCHAL_NO_MUL
+ /* This is not really a leaf function; allocate enough stack space
+ to allow CALL12s to a helper function. */
+ leaf_entry sp, 64
+#else
+ leaf_entry sp, 32
#endif
movi a6, 0x7ff00000
@@ -809,7 +819,7 @@
muluh xh, xh, yh
add xh, xh, a9
-#else
+#else /* ! XCHAL_HAVE_MUL32_HIGH */
/* Break the inputs into 16-bit chunks and compute 16 32-bit partial
products. These partial products are:
@@ -847,7 +857,7 @@
/* Save a7 since it is needed to hold a temporary value. */
s32i a7, sp, 4
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
/* Calling a separate multiply function will clobber a0 and requires
use of a8 as a temporary, so save those values now. (The function
uses a custom ABI so nothing else needs to be saved.) */
@@ -915,12 +925,21 @@
#define set_arg_h(dst, src) \
srli dst, src, 16
+#if __XTENSA_CALL0_ABI__
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
set_arg_ ## xhalf (a13, xreg); \
set_arg_ ## yhalf (a14, yreg); \
call0 .Lmul_mulsi3; \
mov dst, a12
-#endif
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+ set_arg_ ## xhalf (a14, xreg); \
+ set_arg_ ## yhalf (a15, yreg); \
+ call12 .Lmul_mulsi3; \
+ mov dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
/* Add pp1 and pp2 into a10 with carry-out in a9. */
do_mul(a10, xl, l, yl, h) /* pp 1 */
@@ -1032,11 +1051,11 @@
/* Restore values saved on the stack during the multiplication. */
l32i a7, sp, 4
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
l32i a0, sp, 0
l32i a8, sp, 8
#endif
-#endif
+#endif /* ! XCHAL_HAVE_MUL32_HIGH */
/* Shift left by 12 bits, unless there was a carry-out from the
multiply, in which case, shift by 11 bits and increment the
@@ -1157,38 +1176,47 @@
movi xl, 0
j .Lmul_done
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if XCHAL_NO_MUL
/* For Xtensa processors with no multiply hardware, this simplified
version of _mulsi3 is used for multiplying 16-bit chunks of
- the floating-point mantissas. It uses a custom ABI: the inputs
- are passed in a13 and a14, the result is returned in a12, and
- a8 and a15 are clobbered. */
+ the floating-point mantissas. When using CALL0, this function
+ uses a custom ABI: the inputs are passed in a13 and a14, the
+ result is returned in a12, and a8 and a15 are clobbered. */
.align 4
.Lmul_mulsi3:
- movi a12, 0
-.Lmul_mult_loop:
- add a15, a14, a12
- extui a8, a13, 0, 1
- movnez a12, a15, a8
-
- do_addx2 a15, a14, a12, a15
- extui a8, a13, 1, 1
- movnez a12, a15, a8
-
- do_addx4 a15, a14, a12, a15
- extui a8, a13, 2, 1
- movnez a12, a15, a8
-
- do_addx8 a15, a14, a12, a15
- extui a8, a13, 3, 1
- movnez a12, a15, a8
-
- srli a13, a13, 4
- slli a14, a14, 4
- bnez a13, .Lmul_mult_loop
- ret
-#endif /* !MUL16 && !MUL32 && !MAC16 */
+ leaf_entry sp, 16
+ .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+ movi \dst, 0
+1: add \tmp1, \src2, \dst
+ extui \tmp2, \src1, 0, 1
+ movnez \dst, \tmp1, \tmp2
+
+ do_addx2 \tmp1, \src2, \dst, \tmp1
+ extui \tmp2, \src1, 1, 1
+ movnez \dst, \tmp1, \tmp2
+
+ do_addx4 \tmp1, \src2, \dst, \tmp1
+ extui \tmp2, \src1, 2, 1
+ movnez \dst, \tmp1, \tmp2
+
+ do_addx8 \tmp1, \src2, \dst, \tmp1
+ extui \tmp2, \src1, 3, 1
+ movnez \dst, \tmp1, \tmp2
+
+ srli \src1, \src1, 4
+ slli \src2, \src2, 4
+ bnez \src1, 1b
+ .endm
+#if __XTENSA_CALL0_ABI__
+ mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+ /* The result will be written into a2, so save that argument in a4. */
+ mov a4, a2
+ mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+ leaf_return
+#endif /* XCHAL_NO_MUL */
#endif /* L_muldf3 */
#ifdef L_divdf3
Index: config/xtensa/xtensa.md
===================================================================
--- config/xtensa/xtensa.md (revision 131106)
+++ config/xtensa/xtensa.md (working copy)
@@ -1666,21 +1666,6 @@
(set_attr "mode" "none")
(set_attr "length" "0")])
-;; The fix_return_addr pattern sets the high 2 bits of an address in a
-;; register to match the high bits of the current PC.
-(define_insn "fix_return_addr"
- [(set (match_operand:SI 0 "register_operand" "=a")
- (unspec:SI [(match_operand:SI 1 "register_operand" "r")]
- UNSPEC_RET_ADDR))
- (clobber (match_scratch:SI 2 "=r"))
- (clobber (match_scratch:SI 3 "=r"))]
- ""
- "mov\t%2, a0\;call0\t0f\;.align\t4\;0:\;mov\t%3, a0\;mov\ta0, %2\;\
-srli\t%3, %3, 30\;slli\t%0, %1, 2\;ssai\t2\;src\t%0, %3, %0"
- [(set_attr "type" "multi")
- (set_attr "mode" "SI")
- (set_attr "length" "24")])
-
;; Instructions for the Xtensa "boolean" option.
Index: config/xtensa/lib1funcs.asm
===================================================================
--- config/xtensa/lib1funcs.asm (revision 131106)
+++ config/xtensa/lib1funcs.asm (working copy)
@@ -201,17 +201,28 @@
#ifdef L_umulsidi3
+
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
.align 4
.global __umulsidi3
.type __umulsidi3, @function
__umulsidi3:
- leaf_entry sp, 32
#if __XTENSA_CALL0_ABI__
+ leaf_entry sp, 32
addi sp, sp, -32
s32i a12, sp, 16
s32i a13, sp, 20
s32i a14, sp, 24
s32i a15, sp, 28
+#elif XCHAL_NO_MUL
+ /* This is not really a leaf function; allocate enough stack space
+ to allow CALL12s to a helper function. */
+ leaf_entry sp, 48
+#else
+ leaf_entry sp, 16
#endif
#ifdef __XTENSA_EB__
@@ -232,7 +243,7 @@
#else /* ! MUL32_HIGH */
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
/* a0 and a8 will be clobbered by calling the multiply function
but a8 is not used here and need not be saved. */
s32i a0, sp, 0
@@ -290,12 +301,21 @@
#define set_arg_h(dst, src) \
srli dst, src, 16
+#if __XTENSA_CALL0_ABI__
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
set_arg_ ## xhalf (a13, xreg); \
set_arg_ ## yhalf (a14, yreg); \
call0 .Lmul_mulsi3; \
mov dst, a12
-#endif
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+ set_arg_ ## xhalf (a14, xreg); \
+ set_arg_ ## yhalf (a15, yreg); \
+ call12 .Lmul_mulsi3; \
+ mov dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
/* Add pp1 and pp2 into a6 with carry-out in a9. */
do_mul(a6, a2, l, a3, h) /* pp 1 */
@@ -324,7 +344,7 @@
#endif /* !MUL32_HIGH */
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
/* Restore the original return address. */
l32i a0, sp, 0
#endif
@@ -337,38 +357,47 @@
#endif
leaf_return
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if XCHAL_NO_MUL
/* For Xtensa processors with no multiply hardware, this simplified
version of _mulsi3 is used for multiplying 16-bit chunks of
- the floating-point mantissas. It uses a custom ABI: the inputs
- are passed in a13 and a14, the result is returned in a12, and
- a8 and a15 are clobbered. */
+ the floating-point mantissas. When using CALL0, this function
+ uses a custom ABI: the inputs are passed in a13 and a14, the
+ result is returned in a12, and a8 and a15 are clobbered. */
.align 4
.Lmul_mulsi3:
- movi a12, 0
-.Lmul_mult_loop:
- add a15, a14, a12
- extui a8, a13, 0, 1
- movnez a12, a15, a8
-
- do_addx2 a15, a14, a12, a15
- extui a8, a13, 1, 1
- movnez a12, a15, a8
-
- do_addx4 a15, a14, a12, a15
- extui a8, a13, 2, 1
- movnez a12, a15, a8
-
- do_addx8 a15, a14, a12, a15
- extui a8, a13, 3, 1
- movnez a12, a15, a8
-
- srli a13, a13, 4
- slli a14, a14, 4
- bnez a13, .Lmul_mult_loop
- ret
-#endif /* !MUL16 && !MUL32 && !MAC16 */
+ leaf_entry sp, 16
+ .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+ movi \dst, 0
+1: add \tmp1, \src2, \dst
+ extui \tmp2, \src1, 0, 1
+ movnez \dst, \tmp1, \tmp2
+
+ do_addx2 \tmp1, \src2, \dst, \tmp1
+ extui \tmp2, \src1, 1, 1
+ movnez \dst, \tmp1, \tmp2
+
+ do_addx4 \tmp1, \src2, \dst, \tmp1
+ extui \tmp2, \src1, 2, 1
+ movnez \dst, \tmp1, \tmp2
+
+ do_addx8 \tmp1, \src2, \dst, \tmp1
+ extui \tmp2, \src1, 3, 1
+ movnez \dst, \tmp1, \tmp2
+
+ srli \src1, \src1, 4
+ slli \src2, \src2, 4
+ bnez \src1, 1b
+ .endm
+#if __XTENSA_CALL0_ABI__
+ mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+ /* The result will be written into a2, so save that argument in a4. */
+ mov a4, a2
+ mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+ leaf_return
+#endif /* XCHAL_NO_MUL */
.size __umulsidi3, . - __umulsidi3
Index: config/xtensa/xtensa-protos.h
===================================================================
--- config/xtensa/xtensa-protos.h (revision 131106)
+++ config/xtensa/xtensa-protos.h (working copy)
@@ -69,6 +69,7 @@
extern enum reg_class xtensa_secondary_reload_class (enum reg_class,
enum machine_mode, rtx,
int);
+extern void xtensa_initialize_trampoline (rtx, rtx, rtx);
#endif /* RTX_CODE */
#ifdef TREE_CODE
@@ -85,5 +86,6 @@
extern int xtensa_frame_pointer_required (void);
extern void xtensa_expand_prologue (void);
extern void order_regs_for_local_alloc (void);
+extern void xtensa_trampoline_template (FILE *);
#endif /* !__XTENSA_PROTOS_H__ */
Index: config/xtensa/ieee754-sf.S
===================================================================
--- config/xtensa/ieee754-sf.S (revision 131106)
+++ config/xtensa/ieee754-sf.S (working copy)
@@ -1,5 +1,5 @@
/* IEEE-754 single-precision functions for Xtensa
- Copyright (C) 2006 Free Software Foundation, Inc.
+ Copyright (C) 2006, 2007 Free Software Foundation, Inc.
Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
This file is part of GCC.
@@ -488,6 +488,10 @@
#ifdef L_mulsf3
/* Multiplication */
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
__mulsf3_aux:
/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
@@ -570,13 +574,19 @@
.global __mulsf3
.type __mulsf3, @function
__mulsf3:
- leaf_entry sp, 32
#if __XTENSA_CALL0_ABI__
+ leaf_entry sp, 32
addi sp, sp, -32
s32i a12, sp, 16
s32i a13, sp, 20
s32i a14, sp, 24
s32i a15, sp, 28
+#elif XCHAL_NO_MUL
+ /* This is not really a leaf function; allocate enough stack space
+ to allow CALL12s to a helper function. */
+ leaf_entry sp, 64
+#else
+ leaf_entry sp, 32
#endif
movi a6, 0x7f800000
@@ -633,7 +643,7 @@
chunks can be extracted when setting up the arguments to the
separate multiply function. */
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
/* Calling a separate multiply function will clobber a0 and requires
use of a8 as a temporary, so save those values now. (The function
uses a custom ABI so nothing else needs to be saved.) */
@@ -693,12 +703,21 @@
#define set_arg_h(dst, src) \
srli dst, src, 16
+#if __XTENSA_CALL0_ABI__
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
set_arg_ ## xhalf (a13, xreg); \
set_arg_ ## yhalf (a14, yreg); \
call0 .Lmul_mulsi3; \
mov dst, a12
-#endif
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+ set_arg_ ## xhalf (a14, xreg); \
+ set_arg_ ## yhalf (a15, yreg); \
+ call12 .Lmul_mulsi3; \
+ mov dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
/* Add pp1 and pp2 into a6 with carry-out in a9. */
do_mul(a6, a2, l, a3, h) /* pp 1 */
@@ -724,12 +743,12 @@
do_mul(a2, a2, h, a3, h) /* pp 3 */
add a2, a2, a9
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
/* Restore values saved on the stack during the multiplication. */
l32i a0, sp, 0
l32i a8, sp, 4
#endif
-#endif
+#endif /* ! XCHAL_HAVE_MUL32_HIGH */
/* Shift left by 9 bits, unless there was a carry-out from the
multiply, in which case, shift by 8 bits and increment the
@@ -825,38 +844,47 @@
slli a2, a2, 31
j .Lmul_done
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if XCHAL_NO_MUL
/* For Xtensa processors with no multiply hardware, this simplified
version of _mulsi3 is used for multiplying 16-bit chunks of
- the floating-point mantissas. It uses a custom ABI: the inputs
- are passed in a13 and a14, the result is returned in a12, and
- a8 and a15 are clobbered. */
+ the floating-point mantissas. When using CALL0, this function
+ uses a custom ABI: the inputs are passed in a13 and a14, the
+ result is returned in a12, and a8 and a15 are clobbered. */
.align 4
.Lmul_mulsi3:
- movi a12, 0
-.Lmul_mult_loop:
- add a15, a14, a12
- extui a8, a13, 0, 1
- movnez a12, a15, a8
-
- do_addx2 a15, a14, a12, a15
- extui a8, a13, 1, 1
- movnez a12, a15, a8
-
- do_addx4 a15, a14, a12, a15
- extui a8, a13, 2, 1
- movnez a12, a15, a8
-
- do_addx8 a15, a14, a12, a15
- extui a8, a13, 3, 1
- movnez a12, a15, a8
-
- srli a13, a13, 4
- slli a14, a14, 4
- bnez a13, .Lmul_mult_loop
- ret
-#endif /* !MUL16 && !MUL32 && !MAC16 */
+ leaf_entry sp, 16
+ .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+ movi \dst, 0
+1: add \tmp1, \src2, \dst
+ extui \tmp2, \src1, 0, 1
+ movnez \dst, \tmp1, \tmp2
+
+ do_addx2 \tmp1, \src2, \dst, \tmp1
+ extui \tmp2, \src1, 1, 1
+ movnez \dst, \tmp1, \tmp2
+
+ do_addx4 \tmp1, \src2, \dst, \tmp1
+ extui \tmp2, \src1, 2, 1
+ movnez \dst, \tmp1, \tmp2
+
+ do_addx8 \tmp1, \src2, \dst, \tmp1
+ extui \tmp2, \src1, 3, 1
+ movnez \dst, \tmp1, \tmp2
+
+ srli \src1, \src1, 4
+ slli \src2, \src2, 4
+ bnez \src1, 1b
+ .endm
+#if __XTENSA_CALL0_ABI__
+ mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+ /* The result will be written into a2, so save that argument in a4. */
+ mov a4, a2
+ mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+ leaf_return
+#endif /* XCHAL_NO_MUL */
#endif /* L_mulsf3 */
#ifdef L_divsf3