Bug 16890 - [cygwin/mingw32] __m128 automatic variables misaligned
Summary: [cygwin/mingw32] __m128 automatic variables misaligned
Status: RESOLVED INVALID
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 4.0.0
: P2 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2004-08-05 22:46 UTC by Danny Smith
Modified: 2004-08-19 09:43 UTC (History)
1 user (show)

See Also:
Host: i686-pc-mingw32
Target: i686-pc-mingw32
Build: i686-pc-mingw32
Known to work:
Known to fail:
Last reconfirmed: 2004-08-18 16:53:36


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Danny Smith 2004-08-05 22:46:48 UTC
The following produces a segfaulting exe when built
with

gcc -O2 -msse sse_align.c 

/* sse_align.c */

#include <stdio.h>
#include <xmmintrin.h>

/* Overlays one __m128 SSE vector with an array of four floats, so the
   test can fill and print individual elements while passing the whole
   vector to the SSE intrinsic.  */
union f4vector
{
  __m128 v;
  float f[4];
};

/* Reproducer body: fills two vectors held in automatic (stack) storage,
   multiplies them with _mm_mul_ss and prints the four floats of the
   result.  The variables are deliberately automatics: per the report,
   the failure does not occur when they are file-scope or static.  */
void foo(void)
{
  union f4vector a;
  union f4vector b;
  union f4vector c;

  a.f[0] = 1.0F; a.f[1] = 2.0F; a.f[2] = 3.0F; a.f[3] = 4.0F;
  b.f[0] = 5.0F; b.f[1] = 6.0F; b.f[2] = 7.0F; b.f[3] = 8.0F;
  /* This statement is what compiles to the movaps/mulss/movaps sequence
     flagged in the assembler listings.  */
  c.v = _mm_mul_ss (a.v, b.v);
  /* The original format string used "\%", which is not a valid C escape
     sequence; plain "%f" gives the same string the compiler emitted
     (see the .ascii directive in the listing).  */
  printf ("%f\t%f\t%f\t%f\n", c.f[0], c.f[1], c.f[2], c.f[3]);
}

/* Entry point: only calls foo(), so the SSE code runs in a function
   separate from main, and then returns 0.  */
int main ()
{
  foo();
  return 0;
}

Here is the assembler output.  Note the movaps, mulss:
	.file	"sse_align.c"
	.section .rdata,"dr"
LC8:
	.ascii "%f\11%f\11%f\11%f\12\0"
	.text
	.align 4
	.p2align 2,,3
.globl _foo
	.def	_foo;	.scl	2;	.type	32;	.endef
_foo:
	pushl	%ebp
	movl	%esp, %ebp
	subl	$100, %esp
	movl	$0x3f800000, -24(%ebp)
	movl	$0x40000000, -20(%ebp)
	movl	$0x40400000, -16(%ebp)
	movl	$0x40800000, -12(%ebp)
	movl	$0x40a00000, -40(%ebp)
	movl	$0x40c00000, -36(%ebp)
	movl	$0x40e00000, -32(%ebp)
	movl	$0x41000000, -28(%ebp)
	movaps	-24(%ebp), %xmm0 <<<< not 16-byte aligned
	mulss	-40(%ebp), %xmm0 <<<<
	movaps	%xmm0, -56(%ebp) <<<<
	flds	-44(%ebp)
	fstpl	24(%esp)
	flds	-48(%ebp)
	fstpl	16(%esp)
	flds	-52(%ebp)
	fstpl	8(%esp)
	flds	-56(%ebp)
	fstpl	(%esp)
	pushl	$LC8
	call	_printf
	addl	$48, %esp
	leave
	ret
	.def	___main;	.scl	2;	.type	32;	.endef
	.p2align 2,,3
.globl _main
	.def	_main;	.scl	2;	.type	32;	.endef
_main:
	pushl	%ebp
	movl	$16, %eax
	movl	%esp, %ebp
	subl	$8, %esp
	andl	$-16, %esp
	call	__alloca
	call	___main
	call	_foo
	xorl	%eax, %eax
	leave
	ret
	.def	_printf;	.scl	3;	.type	32;	.endef


However, if I set preferred-stack-boundary to 3 (or 2), it works
Here is assembler output from
gcc -O2 -msse -mpreferred-stack-boundary=3 sse_align.c 

	.file	"sse_align.c"
	.section .rdata,"dr"
LC8:
	.ascii "%f\11%f\11%f\11%f\12\0"
	.text
	.align 4
	.p2align 2,,3
.globl _foo
	.def	_foo;	.scl	2;	.type	32;	.endef
_foo:
	pushl	%ebp
	movl	%esp, %ebp
	subl	$84, %esp
	movl	$0x3f800000, -16(%ebp)
	movl	$0x40000000, -12(%ebp)
	movl	$0x40400000, -8(%ebp)
	movl	$0x40800000, -4(%ebp)
	movl	$0x40a00000, -32(%ebp)
	movl	$0x40c00000, -28(%ebp)
	movl	$0x40e00000, -24(%ebp)
	movl	$0x41000000, -20(%ebp)
	movaps	-16(%ebp), %xmm0 <<<< OK
	mulss	-32(%ebp), %xmm0 <<<<
	movaps	%xmm0, -48(%ebp) <<<<
	flds	-36(%ebp)
	fstpl	24(%esp)
	flds	-40(%ebp)
	fstpl	16(%esp)
	flds	-44(%ebp)
	fstpl	8(%esp)
	flds	-48(%ebp)
	fstpl	(%esp)
	pushl	$LC8
	call	_printf
	addl	$40, %esp
	leave
	ret
	.def	___main;	.scl	2;	.type	32;	.endef
	.p2align 2,,3
.globl _main
	.def	_main;	.scl	2;	.type	32;	.endef
_main:
	pushl	%ebp
	movl	$16, %eax
	movl	%esp, %ebp
	andl	$-8, %esp
	call	__alloca
	call	___main
	call	_foo
	xorl	%eax, %eax
	leave
	ret
	.def	_printf;	.scl	3;	.type	32;	.endef


The testcase will also work (regardless of preferred stack
alignment) if I declare the three f4vector variables at file
scope or as static variables within the function, rather
than as automatics.

Is this bug peculiar to windows target (yet another stack
probe fallout)?

Danny
Comment 1 Matthew Daws 2004-08-18 16:04:57 UTC
As I have an interest in this family of bugs being fixed, I thought I'd chime in
here.  Firstly, with GCC 3.4.1, I cannot replicate the bug, and the assembly
output is slightly different.  However, the MOVAPS and MULSS instructions are
the same.  I get the same result with GCC 3.3.3 (all using -O2 -msse), and using
MinGW, of course.

Now, my understanding of the assembly is as follows: in _main, the "andl $-16,
%esp" will align the stack to 16-bytes.  We then call __alloca and ___main,
which for the moment I'll assume do not mess with the stack.  Then we call _foo
which pushes 4 bytes to the stack.  Then, in _foo, we push %ebp, so 8 bytes are
now on the stack, and then set %ebp=%esp.  Thus %ebp+8 is aligned to a 16-byte
boundary.  This makes sense, as then the MOVAPS and MULSS instructions ARE aligned.

I don't see why your test case crashes then.  My only thought is that ___main
and __alloca somehow DO mess with the stack: this might explain how the binary
you produce dies and mine doesn't.

Can anyone confirm or correct my analysis?  Danny: I take it you are using gcc
3.5.0?

Thanks, --Matt Daws
Comment 2 Matthew Daws 2004-08-18 16:49:03 UTC
Sorry: I have confirmed my analysis, at least in some sense.  If I manually
inline the function foo() into main() (i.e. cut-n-paste) then the resulting
binary does segfault.  This is because the assembly output is the same, but the
stack is now actually aligned (rather than aligned +/-8).  Of course, this
follows from the assembly output: the "andl $-16, %esp" aligns the stack, but
*we have already* copied %esp to %ebp, so that %ebp is not aligned in main().

After playing with threads in MinGW, one gets similar problems in threads. 
Basically, it seems that proper functions get a stack aligned +/-8, but that
main() and any function called via a new thread command will have a stack which
is truly aligned.  Of course, the compiler cannot know what is a true function,
and what will be called by a thread.

Can anyone tell me if this is correct?

--Matt
Comment 3 Andrew Pinski 2004-08-18 16:53:36 UTC
Let's confirm this bug.
Comment 4 Matthew Daws 2004-08-18 17:41:36 UTC
I have access to a linux box with "gcc -v" giving:

Reading specs from /usr/lib/gcc-lib/i386-redhat-linux/3.2.2/specs
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man
--infodir=/usr/share/info --enable-shared --enable-threads=posix
--disable-checking --with-system-zlib --enable-__cxa_atexit --host=i386-redhat-linux
Thread model: posix
gcc version 3.2.2 20030222 (Red Hat Linux 3.2.2-5)

Sorry, it's not mine though, so I cannot update.  I cannot replicate the
segfault, but the assembly output is again the same (indeed, it seems identical
to the code Danny gave).

I have inserted the following:

   unsigned int aa,bb;
   asm("movl %%esp,%%eax; movl %%ebp,%%ebx;" : "=a"(aa), "=b"(bb) );
   printf("ESP=%u (%u), EBP=%u (%u).\n",aa,aa&15,bb,bb&15);

Which tells us what ESP and EBP are.  Under linux, EBP is always 8 mod 16, as
expected.  Under Windows, this is only true when foo() is a separate function,
while in main(), EBP is 0 mod 16, leading to unaligned accesses.

I suspect that this is a MinGW issue, though I really don't know enough about
how MinGW interacts with Windows...  I would guess that the internal code which
calls main() needs altering so that ESP is 0 mod 16 before the supposed "call
_main" instruction.

--Matt
Comment 5 Danny Smith 2004-08-18 21:19:18 UTC
Thanks Matt,
Your analysis has helped a lot. It is indeed only a bug when the code is
inlined into main. The bug is in the mingw CRT startup code. I am testing a fix
for that now. 

Danny
Comment 6 Matthew Daws 2004-08-19 09:43:48 UTC
Danny,
Excellent!  My only worry is that the problem is also going to show up with new
threads; or I do not know how to use threads correctly with MinGW: is it enough
to use _beginthreadex, or is there some special MinGW helper function I should
be calling, so that it sets up the stack correctly?