PATCH: Don't allocate space for SSE reg in reg save area if SSE is disabled
H.J. Lu
hjl.tools@gmail.com
Sat Aug 30 22:33:00 GMT 2008
On Thu, Aug 28, 2008 at 01:23:24PM -0400, Jakub Jelinek wrote:
> On Thu, Aug 28, 2008 at 10:04:49AM -0700, H.J. Lu wrote:
> > On Thu, Aug 28, 2008 at 09:57:45AM -0700, H.J. Lu wrote:
> > >
> > > We are allocating space for SSE reg in reg save area even if SSE is
> > > disabled. -mno-sse is used on Linux kernel. Here is patch to avoid it.
> > > OK for trunk and 4.3?
> > >
> >
> > Here is the updated patch with some optimization. OK for trunk and
> > 4.3?
>
> Does this actually work, even if e.g. there are no integer, but some
> floating point va_args?
> Say
> double foo (int n, ...)
> {
> float ret = 0.0;
> va_list ap;
> va_start (ap, n);
> for (int i = 0; i < n; i++)
> ret += va_arg (ap, double);
> va_end (ap);
> return 0;
> }
I added this testcase.
> I'm afraid it would just clobber wrong portion of the stack.
>
> E.g. setup_incoming_varargs_64 does:
> emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
> plus_constant (save_area,
> 8 * X86_64_REGPARM_MAX + 127)));
> where I'd expect you want with your patch ix86_varargs_gpr_size + 127.
> Similarly, ix86_va_start needs some changes too if the area for gpr
> registers is left out. Either change:
> t = build2 (MODIFY_EXPR, type, fpr,
> build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
> - if ix86_varargs_gpr_size is zero, then supposedly va_list doesn't escape
> and so we don't care about ABI effects. Or adjust ap.reg_save_area
> initialization (sav) instead, if ix86_varargs_gpr_size is 0 subtract
> 8*X86_64_REGPARM_MAX from it.
>
It turns out that ix86_varargs_gpr_size can't be 0 since va_arg
is processed before setup_incoming_varargs_64 is called. But
we can do is to skip the whole register save area if it is never
used. Also we shouldn't use X86_64_REGPARM_MAX/X86_64_SSE_REGPARM_MAX
since they depend on the current ABI.
Here is the updated patch for trunk. I am testing it on Linux/ia32
and Linux/Intel64. OK for trunk if all pass?
Thanks.
H.J.
----
gcc/
2008-08-29 H.J. Lu <hongjiu.lu@intel.com>
* config/i386/i386.c (X86_64_VARARGS_SIZE): Removed.
(setup_incoming_varargs_64): Set/check ix86_varargs_gpr_size
and ix86_varargs_fpr_size. Use ix86_varargs_gpr_size instead
of X86_64_REGPARM_MAX.
(ix86_va_start): Check ix86_varargs_gpr_size and
ix86_varargs_fpr_size instead of cfun->va_list_gpr_size and
cfun->va_list_fpr_size, respectively. Use ix86_varargs_gpr_size
instead of X86_64_REGPARM_MAX.
(ix86_gimplify_va_arg): Replace X86_64_REGPARM_MAX and
X86_64_SSE_REGPARM_MAX with REGPARM_64_MAX and
SSE_64_REGPARM_MAX. respectively.
(ix86_compute_frame_layout): Updated.
* config/i386/i386.h (REGPARM_64_MAX): New.
(SSE_64_REGPARM_MAX): Likewise.
(ix86_varargs_gpr_size): Likewise.
(ix86_varargs_fpr_size): Likewise.
(REGPARM_MAX): Updated.
(SSE_REGPARM_MAX): Likewise.
(machine_function): Rename save_varrargs_registers to
save_varargs_registers. Add varargs_gpr_size and varargs_fpr_size.
(ix86_save_varrargs_registers): Renamed to ...
(ix86_save_varargs_registers): This.
gcc/testsuite/
2008-08-29 H.J. Lu <hongjiu.lu@intel.com>
* gcc.target/i386/amd64-abi-3.c: New.
* gcc.target/i386/amd64-abi-4.c: Likewise.
* gcc.target/i386/amd64-abi-5.c: Likewise.
* gcc.target/i386/amd64-abi-6.c: Likewise.
--- gcc/config/i386/i386.c.sse 2008-08-28 12:41:30.000000000 -0700
+++ gcc/config/i386/i386.c 2008-08-29 08:04:04.000000000 -0700
@@ -1631,9 +1631,6 @@ rtx ix86_compare_op0 = NULL_RTX;
rtx ix86_compare_op1 = NULL_RTX;
rtx ix86_compare_emitted = NULL_RTX;
-/* Size of the register save area. */
-#define X86_64_VARARGS_SIZE (X86_64_REGPARM_MAX * UNITS_PER_WORD + X86_64_SSE_REGPARM_MAX * 16)
-
/* Define the structure for the machine field in struct function. */
struct stack_local_entry GTY(())
@@ -6291,11 +6288,20 @@ setup_incoming_varargs_64 (CUMULATIVE_AR
if((cum ? cum->call_abi : ix86_cfun_abi ()) != DEFAULT_ABI)
regparm = DEFAULT_ABI != SYSV_ABI ? X86_64_REGPARM_MAX : X64_REGPARM_MAX;
- if (! cfun->va_list_gpr_size && ! cfun->va_list_fpr_size)
+ /* GPR size of varargs save area. It is fixed since va_arg may use
+ it before we can determine if GPR should be saved. */
+ ix86_varargs_gpr_size = regparm * UNITS_PER_WORD;
+
+ /* FPR size of varargs save area. We don't need it if we don't pass
+ anything in SSE registers. */
+ if (cum->sse_nregs && cfun->va_list_fpr_size)
+ ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
+
+ if (! cfun->va_list_gpr_size && ! ix86_varargs_fpr_size)
return;
/* Indicate to allocate space on the stack for varargs save area. */
- ix86_save_varrargs_registers = 1;
+ ix86_save_varargs_registers = 1;
save_area = frame_pointer_rtx;
set = get_varargs_alias_set ();
@@ -6313,7 +6319,7 @@ setup_incoming_varargs_64 (CUMULATIVE_AR
x86_64_int_parameter_registers[i]));
}
- if (cum->sse_nregs && cfun->va_list_fpr_size)
+ if (ix86_varargs_fpr_size)
{
/* Now emit code to save SSE registers. The AX parameter contains number
of SSE parameter registers used to call this function. We use
@@ -6358,7 +6364,7 @@ setup_incoming_varargs_64 (CUMULATIVE_AR
tmp_reg = gen_reg_rtx (Pmode);
emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
plus_constant (save_area,
- 8 * X86_64_REGPARM_MAX + 127)));
+ ix86_varargs_gpr_size + 127)));
mem = gen_rtx_MEM (BLKmode, plus_constant (tmp_reg, -127));
MEM_NOTRAP_P (mem) = 1;
set_mem_alias_set (mem, set);
@@ -6468,7 +6474,9 @@ ix86_va_start (tree valist, rtx nextarg)
n_gpr = crtl->args.info.regno;
n_fpr = crtl->args.info.sse_regno;
- if (cfun->va_list_gpr_size)
+ gcc_assert (ix86_varargs_gpr_size);
+
+ if (ix86_varargs_gpr_size)
{
type = TREE_TYPE (gpr);
t = build2 (MODIFY_EXPR, type,
@@ -6477,11 +6485,11 @@ ix86_va_start (tree valist, rtx nextarg)
expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
}
- if (cfun->va_list_fpr_size)
+ if (ix86_varargs_fpr_size)
{
type = TREE_TYPE (fpr);
t = build2 (MODIFY_EXPR, type, fpr,
- build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
+ build_int_cst (type, n_fpr * 16 + ix86_varargs_gpr_size));
TREE_SIDE_EFFECTS (t) = 1;
expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
}
@@ -6496,7 +6504,7 @@ ix86_va_start (tree valist, rtx nextarg)
TREE_SIDE_EFFECTS (t) = 1;
expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
- if (cfun->va_list_gpr_size || cfun->va_list_fpr_size)
+ if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
{
/* Find the register save area.
Prologue of the function save it right above stack frame. */
@@ -6638,7 +6646,7 @@ ix86_gimplify_va_arg (tree valist, tree
if (needed_intregs)
{
t = build_int_cst (TREE_TYPE (gpr),
- (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
+ (REGPARM_64_MAX - needed_intregs + 1) * 8);
t = build2 (GE_EXPR, boolean_type_node, gpr, t);
t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
@@ -6647,8 +6655,9 @@ ix86_gimplify_va_arg (tree valist, tree
if (needed_sseregs)
{
t = build_int_cst (TREE_TYPE (fpr),
- (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
- + X86_64_REGPARM_MAX * 8);
+ (SSE_64_REGPARM_MAX
+ - needed_sseregs + 1) * 16
+ + REGPARM_64_MAX * 8);
t = build2 (GE_EXPR, boolean_type_node, fpr, t);
t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
@@ -7476,10 +7485,10 @@ ix86_compute_frame_layout (struct ix86_f
offset += frame->nregs * UNITS_PER_WORD;
/* Va-arg area */
- if (ix86_save_varrargs_registers)
+ if (ix86_save_varargs_registers)
{
- offset += X86_64_VARARGS_SIZE;
- frame->va_arg_size = X86_64_VARARGS_SIZE;
+ frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
+ offset += frame->va_arg_size;
}
else
frame->va_arg_size = 0;
--- gcc/config/i386/i386.h.sse 2008-08-28 12:41:30.000000000 -0700
+++ gcc/config/i386/i386.h 2008-08-29 08:03:56.000000000 -0700
@@ -1871,13 +1871,15 @@ do { \
#define X64_SSE_REGPARM_MAX 4
#define X86_32_SSE_REGPARM_MAX (TARGET_SSE ? 3 : 0)
-#define REGPARM_MAX (TARGET_64BIT ? (TARGET_64BIT_MS_ABI ? X64_REGPARM_MAX \
- : X86_64_REGPARM_MAX) \
- : X86_32_REGPARM_MAX)
-
-#define SSE_REGPARM_MAX (TARGET_64BIT ? (TARGET_64BIT_MS_ABI ? X64_SSE_REGPARM_MAX \
- : X86_64_SSE_REGPARM_MAX) \
- : X86_32_SSE_REGPARM_MAX)
+#define REGPARM_64_MAX (TARGET_64BIT_MS_ABI \
+ ? X64_REGPARM_MAX : X86_64_REGPARM_MAX)
+#define REGPARM_MAX (TARGET_64BIT \
+ ? REGPARM_64_MAX : X86_32_REGPARM_MAX)
+
+#define SSE_64_REGPARM_MAX (TARGET_64BIT_MS_ABI \
+ ? X64_SSE_REGPARM_MAX : X86_64_SSE_REGPARM_MAX)
+#define SSE_REGPARM_MAX (TARGET_64BIT \
+ ? SSE_64_REGPARM_MAX : X86_32_SSE_REGPARM_MAX)
#define MMX_REGPARM_MAX (TARGET_64BIT ? 0 : (TARGET_MMX ? 3 : 0))
@@ -2389,7 +2391,9 @@ struct machine_function GTY(())
{
struct stack_local_entry *stack_locals;
const char *some_ld_name;
- int save_varrargs_registers;
+ int save_varargs_registers;
+ int varargs_gpr_size;
+ int varargs_fpr_size;
int accesses_prev_frame;
int optimize_mode_switching[MAX_386_ENTITIES];
int needs_cld;
@@ -2415,7 +2419,9 @@ struct machine_function GTY(())
};
#define ix86_stack_locals (cfun->machine->stack_locals)
-#define ix86_save_varrargs_registers (cfun->machine->save_varrargs_registers)
+#define ix86_save_varargs_registers (cfun->machine->save_varargs_registers)
+#define ix86_varargs_gpr_size (cfun->machine->varargs_gpr_size)
+#define ix86_varargs_fpr_size (cfun->machine->varargs_fpr_size)
#define ix86_optimize_mode_switching (cfun->machine->optimize_mode_switching)
#define ix86_current_function_needs_cld (cfun->machine->needs_cld)
#define ix86_tls_descriptor_calls_expanded_in_cfun \
--- gcc/testsuite/gcc.target/i386/amd64-abi-3.c.sse 2008-08-28 14:53:34.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/amd64-abi-3.c 2008-08-28 15:01:50.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mno-sse" } */
+/* { dg-final { scan-assembler "subq\[\\t \]*\\\$88,\[\\t \]*%rsp" } } */
+/* { dg-final { scan-assembler-not "subq\[\\t \]*\\\$216,\[\\t \]*%rsp" } } */
+
+#include <stdarg.h>
+
+void foo (va_list va_arglist);
+
+void
+test (int a1, ...)
+{
+ va_list va_arglist;
+ va_start (va_arglist, a1);
+ foo (va_arglist);
+ va_end (va_arglist);
+}
--- gcc/testsuite/gcc.target/i386/amd64-abi-4.c.sse 2008-08-28 14:53:34.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/amd64-abi-4.c 2008-08-28 14:53:34.000000000 -0700
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mno-sse" } */
+
+#include <stdarg.h>
+#include <assert.h>
+
+int n1 = 30;
+int n2 = 324;
+void *n3 = (void *) &n2;
+int n4 = 407;
+
+int e1;
+int e2;
+void *e3;
+int e4;
+
+static void
+__attribute__((noinline))
+foo (va_list va_arglist)
+{
+ e2 = va_arg (va_arglist, int);
+ e3 = va_arg (va_arglist, void *);
+ e4 = va_arg (va_arglist, int);
+}
+
+static void
+__attribute__((noinline))
+test (int a1, ...)
+{
+ e1 = a1;
+ va_list va_arglist;
+ va_start (va_arglist, a1);
+ foo (va_arglist);
+ va_end (va_arglist);
+}
+
+int
+main ()
+{
+ test (n1, n2, n3, n4);
+ assert (n1 == e1);
+ assert (n2 == e2);
+ assert (n3 == e3);
+ assert (n4 == e4);
+ return 0;
+}
--- gcc/testsuite/gcc.target/i386/amd64-abi-5.c.sse 2008-08-29 07:33:07.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/amd64-abi-5.c 2008-08-28 15:22:12.000000000 -0700
@@ -0,0 +1,64 @@
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2" } */
+
+#include <stdarg.h>
+#include <assert.h>
+
+int n1 = 30;
+double n2 = 324;
+double n3 = 39494.94;
+double n4 = 407;
+double n5 = 32.304;
+double n6 = 394.14;
+double n7 = 4.07;
+double n8 = 32.4;
+double n9 = 314.194;
+double n10 = 0.1407;
+
+int e1;
+double e2;
+double e3;
+double e4;
+double e5;
+double e6;
+double e7;
+double e8;
+double e9;
+double e10;
+
+static void
+__attribute__((noinline))
+test (int a1, ...)
+{
+ e1 = a1;
+ va_list va_arglist;
+ va_start (va_arglist, a1);
+ e2 = va_arg (va_arglist, double);
+ e3 = va_arg (va_arglist, double);
+ e4 = va_arg (va_arglist, double);
+ e5 = va_arg (va_arglist, double);
+ e6 = va_arg (va_arglist, double);
+ e7 = va_arg (va_arglist, double);
+ e8 = va_arg (va_arglist, double);
+ e9 = va_arg (va_arglist, double);
+ e10 = va_arg (va_arglist, double);
+ va_end (va_arglist);
+}
+
+int
+main ()
+{
+ test (n1, n2, n3, n4, n5, n6, n7, n8, n9, n10);
+ assert (n1 == e1);
+ assert (n2 == e2);
+ assert (n3 == e3);
+ assert (n4 == e4);
+ assert (n5 == e5);
+ assert (n6 == e6);
+ assert (n7 == e7);
+ assert (n8 == e8);
+ assert (n9 == e9);
+ assert (n10 == e10);
+ return 0;
+}
--- gcc/testsuite/gcc.target/i386/amd64-abi-6.c.sse 2008-08-29 07:33:11.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/amd64-abi-6.c 2008-08-28 15:23:40.000000000 -0700
@@ -0,0 +1,71 @@
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2" } */
+
+#include <stdarg.h>
+#include <assert.h>
+
+int n1 = 30;
+double n2 = 324;
+double n3 = 39494.94;
+double n4 = 407;
+double n5 = 32.304;
+double n6 = 394.14;
+double n7 = 4.07;
+double n8 = 32.4;
+double n9 = 314.194;
+double n10 = 0.1407;
+
+int e1;
+double e2;
+double e3;
+double e4;
+double e5;
+double e6;
+double e7;
+double e8;
+double e9;
+double e10;
+
+static void
+__attribute__((noinline))
+foo (va_list va_arglist)
+{
+ e2 = va_arg (va_arglist, double);
+ e3 = va_arg (va_arglist, double);
+ e4 = va_arg (va_arglist, double);
+ e5 = va_arg (va_arglist, double);
+ e6 = va_arg (va_arglist, double);
+ e7 = va_arg (va_arglist, double);
+ e8 = va_arg (va_arglist, double);
+ e9 = va_arg (va_arglist, double);
+ e10 = va_arg (va_arglist, double);
+}
+
+static void
+__attribute__((noinline))
+test (int a1, ...)
+{
+ va_list va_arglist;
+ e1 = a1;
+ va_start (va_arglist, a1);
+ foo (va_arglist);
+ va_end (va_arglist);
+}
+
+int
+main ()
+{
+ test (n1, n2, n3, n4, n5, n6, n7, n8, n9, n10);
+ assert (n1 == e1);
+ assert (n2 == e2);
+ assert (n3 == e3);
+ assert (n4 == e4);
+ assert (n5 == e5);
+ assert (n6 == e6);
+ assert (n7 == e7);
+ assert (n8 == e8);
+ assert (n9 == e9);
+ assert (n10 == e10);
+ return 0;
+}
More information about the Gcc-patches
mailing list