Bug 50444 - [4.6 Regression] -ftree-sra ignores alignment
Summary: [4.6 Regression] -ftree-sra ignores alignment
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 4.6.2
: P2 normal
Target Milestone: 4.7.0
Assignee: Richard Biener
URL:
Keywords: wrong-code
Depends on:
Blocks:
 
Reported: 2011-09-17 15:34 UTC by John Salmon
Modified: 2013-04-12 16:17 UTC (History)
7 users (show)

See Also:
Host:
Target:
Build:
Known to work: 4.5.3, 4.7.0
Known to fail: 4.6.0
Last reconfirmed: 2011-09-25 00:00:00


Attachments
C example (480 bytes, text/x-csrc)
2011-09-17 15:34 UTC, John Salmon
Details
patch-in-progress (1.36 KB, patch)
2012-01-18 11:23 UTC, Martin Jambor
Details | Diff
other candidate patch (1.49 KB, patch)
2012-01-20 14:16 UTC, Richard Biener
Details | Diff

Note You need to log in before you can comment on or make changes to this bug.
Description John Salmon 2011-09-17 15:34:18 UTC
Created attachment 25309 [details]
C example

On Linux, x86_64, the attached code segfaults when compiled with
gcc4.6.1 and gcc4.6.0.  There's no problem with gcc-4.5.3, nor
when -O1 is removed.

gdb reports that the instruction at the pc when it segfaults is:

   movdqa %xmm0, 0x28(%rbx)

and that 0x28(%rbx) is only 8-bit aligned.  It looks like the
alignment requirement of the movdqa instruction has somehow been
"forgotten" in the optimization that inlined the call to both caster()
and ssefunc() and eliminated the memcpys.

salmonj@drdlogin0039.en.desres$ desres-cleanenv -m gcc/4.6.1-23A/bin gcc -Wall -O1 e2.c
salmonj@drdlogin0039.en.desres$ a.out
Segmentation fault (core dumped)
salmonj@drdlogin0039.en.desres$ gdb a.out
GNU gdb (GDB) 7.0.1
Copyright (C) 2009 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-unknown-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /d/en/salmonj-0/junk/misalign/a.out...done.
(gdb) r
Starting program: /d/en/salmonj-0/junk/misalign/a.out 
warning: no loadable sections found in added symbol-file system-supplied DSO at 0x2aaaaaaab000

Program received signal SIGSEGV, Segmentation fault.
_mm_xor_si128 (e=0x7fffffffc070) at e2.c:51
51	        e->v = caster(*incr(&e->c), e->key);
(gdb) x /10i $pc
0x400504 <method+28>:	movdqa %xmm0,0x28(%rbx)
0x400509 <method+33>:	movq   $0x4,0x20(%rbx)
0x400511 <method+41>:	mov    0x20(%rbx),%rax
0x400515 <method+45>:	lea    -0x1(%rax),%rdx
0x400519 <method+49>:	mov    %rdx,0x20(%rbx)
0x40051d <method+53>:	mov    0x24(%rbx,%rax,4),%eax
0x400521 <method+57>:	pop    %rbx
0x400522 <method+58>:	retq   
0x400523 <main>:	push   %rbx
0x400524 <main+1>:	add    $0xffffffffffffff80,%rsp
(gdb) info reg
rax            0x7fffffffc080	140737488339072
rbx            0x7fffffffc070	140737488339056
rcx            0x400570	4195696
rdx            0x7fffffffc1a8	140737488339368
rsi            0x7fffffffc198	140737488339352
rdi            0x7fffffffc080	140737488339072
rbp            0x0	0x0
rsp            0x7fffffffc020	0x7fffffffc020
r8             0x3f8b3532d0	272918459088
r9             0x3f8ac0d730	272910833456
r10            0x0	0
r11            0x3f8b01d8a0	272915093664
r12            0x0	0
r13            0x7fffffffc190	140737488339344
r14            0x0	0
r15            0x0	0
rip            0x400504	0x400504 <method+28>
eflags         0x10202	[ IF RF ]
cs             0x33	51
ss             0x2b	43
ds             0x0	0
es             0x0	0
fs             0x0	0
gs             0x0	0
fctrl          0x37f	895
fstat          0x0	0
ftag           0xffff	65535
fiseg          0x0	0
fioff          0x0	0
foseg          0x0	0
fooff          0x0	0
fop            0x0	0
mxcsr          0x1f80	[ IM DM ZM OM UM PM ]
(gdb)
Comment 1 John Salmon 2011-09-25 15:22:07 UTC
Here's a slightly smaller test case.  The problem is the 'movdqa'.  According the x86-64 ABI, rsp+8 is 16-bit aligned at the entry to main, and therefore so is %rdi when we try to execute 
   movdqa %xmm0, (%rdi)
resulting in segv.


thsalmonj@drdlogin0039.en.desres$ cat e2.c
#include <stdint.h>
#include <emmintrin.h>
#include <string.h>
struct a4x32{
      uint32_t v[4];
};
struct a1xm128i{
      __m128i m;
};
static struct a4x32 zero () {
    struct a1xm128i c1x128;
    struct a4x32 c4x32;
    c1x128.m = _mm_setzero_si128();
    memcpy (&c4x32.v[0], &c1x128.m, sizeof (c4x32));
    return c4x32;
}
struct S {
     struct a4x32 v;
};
void method (struct S * e) {
     e->v = zero ();
}
int main (int argc, char **argv) {
    struct S e;
    method(&e);
    return e.v.v[0];
}
salmonj@drdlogin0039.en.desres$ desres-cleanenv -m gcc/4.6.1-23A/bin gcc -Wall -O -std=c99 -pedantic -S e2.c
salmonj@drdlogin0039.en.desres$ desres-cleanenv -m gcc/4.6.1-23A/bin gcc e2.s
salmonj@drdlogin0039.en.desres$ ./a.out
Segmentation fault (core dumped)
salmonj@drdlogin0039.en.desres$ cat e2.s
	.file	"e2.c"
	.text
	.globl	method
	.type	method, @function
method:
.LFB522:
	.cfi_startproc
	pxor	%xmm0, %xmm0
	movdqa	%xmm0, (%rdi)
	ret
	.cfi_endproc
.LFE522:
	.size	method, .-method
	.globl	main
	.type	main, @function
main:
.LFB523:
	.cfi_startproc
	subq	$16, %rsp
	.cfi_def_cfa_offset 24
	movq	%rsp, %rdi
	call	method
	movl	(%rsp), %eax
	addq	$16, %rsp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE523:
	.size	main, .-main
	.ident	"GCC: (GNU) 4.6.1"
	.section	.note.GNU-stack,"",@progbits
salmonj@drdlogin0039.en.desres$
Comment 2 H.J. Lu 2011-09-26 17:24:47 UTC
The problem is the wrong alignment for e->v in method.
The initial RTL has

(insn 17 16 18 4 (set (mem/s/j:V2DI (plus:DI (reg/v/f:DI 66 [ e ]) 
                (const_int 40 [0x28])) [0 MEM[(struct Engine *)e_1(D) + 40B].m+0 S16 A128])
        (reg:V2DI 68)) x.c:36 -1
     (nil))

A128 is wrong.
Comment 3 H.J. Lu 2011-09-26 17:39:41 UTC
-ftree-isa ignores alignment and generates:


  SR.11_17 = SR.10_18;
  MEM[(struct Engine *)e_1(D) + 40B].m = SR.11_17;
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This is wrong.
Comment 4 H.J. Lu 2011-09-26 20:01:44 UTC
It is caused by revision 164515:

http://gcc.gnu.org/ml/gcc-cvs/2010-09/msg00812.html
Comment 5 rguenther@suse.de 2011-09-27 08:52:06 UTC
On Mon, 26 Sep 2011, hjl.tools at gmail dot com wrote:

> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50444
> 
> H.J. Lu <hjl.tools at gmail dot com> changed:
> 
>            What    |Removed                     |Added
> ----------------------------------------------------------------------------
>                  CC|                            |jamborm at gcc dot gnu.org
>             Summary|unaligned movdqa            |-ftree-isa ignores
>                    |instruction after inlining  |alignment
> 
> --- Comment #3 from H.J. Lu <hjl.tools at gmail dot com> 2011-09-26 17:39:41 UTC ---
> -ftree-isa ignores alignment and generates:
> 
> 
>   SR.11_17 = SR.10_18;
>   MEM[(struct Engine *)e_1(D) + 40B].m = SR.11_17;
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> This is wrong.

Ah, so we now have the SSE testcase ;)
Comment 6 Richard Biener 2011-10-24 07:54:36 UTC
Trunk seems to "work", but surely the issue is latent there.  Regression from 4.5.
Comment 7 Jakub Jelinek 2011-10-26 17:13:39 UTC
GCC 4.6.2 is being released.
Comment 8 Richard Biener 2011-10-27 10:17:30 UTC
Let's do something about this.
Comment 9 Martin Jambor 2011-12-16 19:35:07 UTC
I have just discovered this has not been fixed by the patch to dela with PR 50569. I'm moving this to the top of my todo list now.
Comment 10 Eric Botcazou 2011-12-17 14:54:32 UTC
> I have just discovered this has not been fixed by the patch to dela with PR
> 50569. I'm moving this to the top of my todo list now.

This is expected, PR50569 is a different (sub-)problem.  The similar problem is rather PR51315 (whose fix was restricted to strict-alignment platforms).
Comment 11 Martin Jambor 2012-01-12 13:47:04 UTC
I think that SRA's part of the fix is what I have just posted to the mailing list:
http://gcc.gnu.org/ml/gcc-patches/2012-01/msg00613.html
Comment 12 Richard Biener 2012-01-16 15:25:06 UTC
Testcase w/o includes that fails with 4.6 and 4.7:

typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef unsigned int uint32_t;

typedef struct {
    uint32_t v[4];
} a4x32;

a4x32* incr(a4x32* x)
{
  x->v[0] += 1;
  return x;
}

typedef struct {
    __m128i m;
} a1xm128i;

static inline  a1xm128i ssefunc( a1xm128i in,  a1xm128i k)
{
  a1xm128i ret;
  ret.m = (__m128i)__builtin_ia32_pxor128 ((__v2di)in.m, (__v2di)k.m);
  return ret;
}

static  a4x32  caster( a4x32 c4x32,  a1xm128i k)
{
  a1xm128i c1x128;
  if( sizeof(c4x32) != sizeof(c1x128) ) __builtin_abort();
  __builtin_memcpy(&c1x128, &c4x32, sizeof(c1x128));
  c1x128 = ssefunc(c1x128, k);
  __builtin_memcpy(&c4x32, &c1x128, sizeof(c4x32));
  return c4x32;
}

typedef struct  {
    a1xm128i key;
    a4x32 c;
    __SIZE_TYPE__ elem;
    a4x32 v;
} Engine;

void ctor(Engine *e)
{
  e->elem = 0;
  e->key.m = (__m128i)(__v4si){ 0, 0, 0, 0 };
  e->c.v[0] = 0;
  e->c.v[1] = 0;
  e->c.v[2] = 0;
  e->c.v[3] = 0;
}

uint32_t method( Engine *e)
{
  if( e->elem == 0 )
    {
      e->v = caster(*incr(&e->c), e->key);
      e->elem = 4;
    }
  return e->v.v[--e->elem];
}

int main()
{
  Engine e4; ctor(&e4);
  Engine e5; ctor(&e5);
  if(method(&e4)!=method(&e5))
    __builtin_abort ();
  return 0;
}

and the problematic SRA is indeed happening during ESRA in caster () which
looks like (before SRA):

<bb 2>:
  MEM[(char * {ref-all})&c1x128] = MEM[(char * {ref-all})&c4x32];
  in = c1x128;
  k = k;
  D.1785_7 = k.m;
  D.1784_8 = in.m;
  D.1783_9 = __builtin_ia32_pxor128 (D.1784_8, D.1785_7);
  D.1782.m = D.1783_9;
  D.1780 = D.1782;
  c1x128 = D.1780;
  MEM[(char * {ref-all})&c4x32] = MEM[(char * {ref-all})&c1x128];
  D.1760 = c4x32;
  c1x128 ={v} {CLOBBER};
  return D.1760;

and after SRA:

<bb 2>:
  c4x32$m_4 = MEM[(struct  *)&c4x32].m;
  c1x128$m_14 = c4x32$m_4;
  in$m_13 = c1x128$m_14;
  k$m_12 = MEM[(struct  *)&k].m;
  D.1785_7 = k$m_12;
  D.1784_8 = in$m_13;
  D.1783_9 = __builtin_ia32_pxor128 (D.1784_8, D.1785_7);
  SR.6_11 = D.1783_9;
  SR.7_10 = SR.6_11;
  c1x128$m_2 = SR.7_10;
  c4x32$m_15 = c1x128$m_2;
  MEM[(struct  *)&D.1760].m = c4x32$m_15;
  c1x128$m_16 = { 0, 0 };
  return D.1760;

notice that D.1760 is of type a4x32 and thus has the alignment of an
integer.  But SRA constructs in-place the object of type c1x128.
SRA analysis should have seen the alignment breaking copy

  MEM[(char * {ref-all})&c4x32] = MEM[(char * {ref-all})&c1x128];

which uses a properly aligned type for the store.  Similarly the
prevailing store

  D.1760 = c4x32;

has the alignment of D.1760.

D.1760 already has a bogus type in lacc->type.  We can easily avoid
translating across aggregate copies that would transfer bogusly aligned types
to an access via

Index: tree-sra.c
===================================================================
--- tree-sra.c  (revision 183205)
+++ tree-sra.c  (working copy)
@@ -2290,7 +2290,9 @@ propagate_subaccesses_across_link (struc
 
   if (is_gimple_reg_type (racc->type))
     {
-      if (!lacc->first_child && !racc->first_child)
+      if (!lacc->first_child && !racc->first_child
+         && (get_object_alignment (lacc->expr)
+             >= get_object_alignment (racc->expr)))
        {
          tree t = lacc->base;
 
or make sure to transfer the alignment to a constructed bare(!) MEM_REF
from lacc->expr before overwriting that (assuming it retains the original
form up until here).
Comment 13 Richard Biener 2012-01-16 15:41:50 UTC
Or, transfering the alignment:

Index: tree-sra.c
===================================================================
--- tree-sra.c  (revision 183205)
+++ tree-sra.c  (working copy)
@@ -2294,15 +2294,17 @@ propagate_subaccesses_across_link (struc
        {
          tree t = lacc->base;
 
-         lacc->type = racc->type;
-         if (build_user_friendly_ref_for_offset (&t, TREE_TYPE (t),
-                                                 lacc->offset, racc->type))
+         if (lacc->type == racc->type
+             && build_user_friendly_ref_for_offset (&t, TREE_TYPE (t),
+                                                    lacc->offset, lacc->type))
            lacc->expr = t;
          else
            {
-             lacc->expr = build_ref_for_model (EXPR_LOCATION (lacc->base),
-                                               lacc->base, lacc->offset,
-                                               racc, NULL, false);
+             lacc->type = build_aligned_type (racc->type,
+                                              get_object_alignment (lacc->expr));
+             lacc->expr = build_ref_for_offset (EXPR_LOCATION (lacc->base),
+                                                lacc->base, lacc->offset,
+                                                lacc->type, NULL, false);
              lacc->grp_no_warning = true;
            }
        }

note the build_user_friendly_ref_for_offset which is really bogus as it
would construct component-refs out of thin air.  The above probably
completely disables that path ...

get_object_alignment is a little conservative - we can and need to
do better by mimicking what expand does.  Note that we can't use
build_ref_for_model here.

With the above the testcase passes as well and we still scalarize:

  SR.7_10 = SR.6_11;
  c1x128$m_2 = SR.7_10;
  c4x32_15 = c1x128$m_2;
  MEM[(struct  *)&D.1760] = c4x32_15;
Comment 14 Martin Jambor 2012-01-18 11:23:00 UTC
Created attachment 26362 [details]
patch-in-progress

I talked to richi on IRC yesterday and we agreed that because we rely
on build_ref_for_model to create the whole chain of COMPONENT_REFs on
strict alignment platforms, that we would disable some scalarization
in cases like this, after all.

However, in addition to what richi suggested in comment #12, I found
out that that was not enough to make the test pass on i686.  Also,
propagation across assignments is not the only way how these may be
created for the aggregate on the LHS, they might just be there because
of pre-existing accesses elsewhere in the function.

So I looked at the caster dumps again and found out that indeed the
very first assignment 

  MEM[(char * {ref-all})&c1x128] = MEM[(char * {ref-all})&c4x32];

had the LHS replaced by a SRA vector replacement and the RHS by a
reference created by build_ref_for_model with the same problems.  This
lead to a similar situation but with switched sides.

So I looked at where these build_ref_for_model an came up with the
following patch which is fairly straightforward and makes the testcase
pass also on i686.  However, there are two problems with the patch.

1. This is the easy part.  I'm afraid similar problems are lurking in
processing aggregate copies with children accesses.  But those could
be dealt with afterwards.

2. When I bootstrapped gcc and run the testsuite with this patch, I
got a new failure in libstdc++:

  23_containers/forward_list/modifiers/1.cc

The failure persists with -fno-tree-sra, so something went wrong when
compiling the library.  The patch is very simple and only disabling
stuff which then has to through the VIEW_CONVERT_EXPR path so at the
moment I think that either the condition guarding the following

  force_gimple_rhs = true;

is wrong or I have uncovered some other bug :-(
Comment 15 Richard Biener 2012-01-20 14:16:25 UTC
Created attachment 26395 [details]
other candidate patch

I'm testing the following patch instead, which avoids changing access types
for all-scalar across-link propagations (we're going to create proper V_C_Es
later).  I also remove the fancy code that tries to avoid adding V_C_Es,
it looks it will cause more trouble than missed-optimizations.

That way we completely avoid needing to care for alignment at that particular
places.  Whether the aggregate copy across-link propagation is affected in
a similar way remains to be seen.

I'll see if I run into the same issue as you and investigate that.
Comment 16 Richard Biener 2012-01-20 15:46:40 UTC
(In reply to comment #15)
> Created attachment 26395 [details]
> other candidate patch
> 
> I'm testing the following patch instead, which avoids changing access types
> for all-scalar across-link propagations (we're going to create proper V_C_Es
> later).  I also remove the fancy code that tries to avoid adding V_C_Es,
> it looks it will cause more trouble than missed-optimizations.
> 
> That way we completely avoid needing to care for alignment at that particular
> places.  Whether the aggregate copy across-link propagation is affected in
> a similar way remains to be seen.
> 
> I'll see if I run into the same issue as you and investigate that.

gcc.dg/torture/pr47228.c shows that we rely on the build-ref-for-model
path in sra_modify_assign as we scalarize

struct S4
{
  unsigned f0:24;
} __attribute__((__packed__));

to unsigned int : 24, which is of different size, so we refuse to
VIEW_CONVERT the SImode register to BLKmode.  I'm not entirely sure
what's the best cause of action here, but certainly either detecting
the size-mismatch issue at analysis phase or papering over the issue
with build-ref-for-model (which might not always suceed?!).

Other FAILs this patch causes are

Running target unix/
FAIL: gcc.dg/tree-ssa/pr45144.c scan-tree-dump optimized " = VIEW_CONVERT_EXPR<u
nsigned int>\\(a\\);"

Running target unix/-m32
FAIL: gcc.dg/torture/pr45678-2.c  -Os  execution test
FAIL: gcc.dg/tree-ssa/pr45144.c scan-tree-dump optimized " = VIEW_CONVERT_EXPR<u
nsigned int>\\(a\\);"

Running target unix/
FAIL: 20_util/hash/chi2_quality.cc execution test
FAIL: 23_containers/forward_list/capacity/resize_size.cc execution test
FAIL: 23_containers/forward_list/modifiers/2.cc execution test
FAIL: 23_containers/list/operations/3.cc execution test
FAIL: 23_containers/list/operations/3_c++0x.cc execution test
FAIL: ext/pb_ds/example/basic_map.cc execution test
FAIL: ext/pb_ds/example/basic_multiset.cc execution test
FAIL: ext/pb_ds/example/basic_set.cc execution test
FAIL: ext/pb_ds/example/erase_if.cc execution test
FAIL: ext/pb_ds/example/tree_intervals.cc execution test
FAIL: ext/pb_ds/example/tree_join.cc execution test
FAIL: ext/pb_ds/example/tree_order_statistics.cc execution test
FAIL: ext/pb_ds/regression/associative_containers.cc execution test
FAIL: ext/pb_ds/regression/tree_map_rand.cc execution test
FAIL: ext/pb_ds/regression/tree_set_rand.cc execution test

Running target unix//-m32
FAIL: 23_containers/list/operations/3_c++0x.cc execution test
FAIL: 25_algorithms/nth_element/2.cc execution test

I'm going to test the two parts of the patch separately now.
Comment 17 Richard Biener 2012-01-27 14:54:44 UTC
Author: rguenth
Date: Fri Jan 27 14:54:37 2012
New Revision: 183629

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=183629
Log:
2012-01-27  Richard Guenther  <rguenther@suse.de>

	PR tree-optimization/50444
	* expr.c (mem_ref_refers_to_non_mem_p): New function.
	(expand_assignment): Use it.  Properly handle misaligned
	bases when expanding stores to component references.
	(expand_expr_real_1): Use mem_ref_refers_to_non_mem_p and
	refactor that case.

Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/expr.c
Comment 18 Richard Biener 2012-01-27 14:57:04 UTC
Author: rguenth
Date: Fri Jan 27 14:56:54 2012
New Revision: 183630

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=183630
Log:
2012-01-27  Richard Guenther  <rguenther@suse.de>

	PR tree-optimization/50444
	* tree-sra.c (build_ref_for_offset): Properly adjust the
	MEM_REF type for unaligned accesses.

	* gcc.dg/torture/pr50444.c: New testcase.

Added:
    trunk/gcc/testsuite/gcc.dg/torture/pr50444.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/tree-sra.c
Comment 19 Richard Biener 2012-01-27 15:00:49 UTC
Fixed on trunk.
Comment 20 Jakub Jelinek 2012-03-01 14:38:33 UTC
GCC 4.6.3 is being released.
Comment 21 Jakub Jelinek 2013-04-12 16:17:41 UTC
The 4.6 branch has been closed, fixed in GCC 4.7.0.