After upgrading to 4.4.1, one of my unit tests started failing reproducibly. I tried to find the part of -O3 (compared to -O2) that caused the failure, and I was able to "fix" the error by changing "optimize >= 3" to "optimize >= 4" in tree-ssa-loop.c lines 509 and 550. That's the minimal change to make it work again. Next I tried to reduce my code to a simple testcase, but that is really hard. I will attach the smallest file I was able to reduce it to. But in that case it also fails at -O3 (with my change above, -O4 was necessary to make it fail), -O2 and -O1. -O0 still doesn't fail. Any further simplification of the code made the bug go away. The code creates an array of floats, from which an index vector (starting from {0, 1}) gathers values into a float vector class. Then the code verifies that the correct values were read. The error I was able to identify in the assembly is that the multiplication at line 93 ({0, 1} * uint_v(1)) is carried out as {0, 1} * {2, 3} instead of {0, 1} * {1, 1}.
Created attachment 18278 [details] minimal testcase
Confirmed. Trunk works for me. If I disable gcse the testcase is no longer miscompiled (likewise if I disable tree PRE which only hoists some conversions - thus likely const/copy propagation goes wrong or addresses are improperly rewritten). The RTL after expansion looks sane wrt alias sets.
This has been broken even on the trunk, from r146669 through r146847 (so it works again with r146848). I have no idea whether it is the same thing or not, nor whether r146848 really fixed it rather than just making it latent (and similarly whether r146669 broke it or just made it real rather than latent).
On the 4.4 branch/x86_64-linux, I can reproduce even with -O1 on: extern "C" void abort (void); #define _MAY_ALIAS __attribute__((__may_alias__)) typedef struct { float v[2]; } _float_v; typedef struct { int v[2]; } _int_v; typedef unsigned int _UInt _MAY_ALIAS; typedef signed int _Int _MAY_ALIAS; typedef float _Float _MAY_ALIAS; static inline unsigned short less_than (_int_v a, _int_v b) { unsigned short r = 0; const _UInt *p1 = (const _UInt *) &a; const _UInt *p2 = (const _UInt *) &b; for (int i=0; i < 2; i++) if (p1[i] < p2[i]) r |= (1 << i); return r; } static inline _int_v multiply (_int_v b, _int_v c) { _int_v r; _Int *p3 = (_Int *) &c; for (int i=0; i < 2; i++) r.v[i] = (int) (b.v[i] * p3[i] & 0xFFFFFFFF); return r; } static inline _float_v gather (_int_v indexes, const void *baseAddr) { _float_v r; _Int *idx = (_Int *) &indexes; _Float *src = (_Float *) baseAddr; for (int i=0; i < 2; i++) r.v[i] = *(src + idx[i]); return r; } static inline _int_v add (const _int_v &b, const _int_v &c) { _int_v result; _Int *r = (_Int *) &result; for (int i=0; i < 2; i++) r[i] = b.v[i] + c.v[i]; return result; } struct uint_v { _int_v data; inline uint_v () { data.v[0] = 0; data.v[1] = 1; } inline uint_v (unsigned int a) { for (int i=0; i < 2; i++) *(_UInt *) &data.v[i] = a; } inline uint_v (_int_v x) : data (x) {} inline uint_v operator* (const uint_v &x) const { return multiply (data, x.data); } inline uint_v operator+ (const uint_v &x) const { return uint_v (add (data, x.data)); } inline unsigned short operator< (const uint_v &x) const { return less_than (data, x.data); } }; struct float_v { _float_v data; explicit inline float_v (const uint_v &x) { _UInt *p2 = (_UInt *) &x.data; for (int i=0; i < 2; i++) data.v[i] = p2[i]; } inline float_v (const float *array, const uint_v &indexes) { const uint_v &offsets = indexes * uint_v (1); data = gather (offsets.data, array); } __attribute__((noinline)) unsigned short operator== (const float_v &x) const { unsigned short r = 0; for 
(int i=0; i < 2; i++) if (data.v[i] == x.data.v[i]) r |= (1 << i); return r; } }; int main () { const float array[2] = { 2, 3 }; unsigned short mask; for (uint_v i; (mask = (i < 2)) == 3; i = i + 2) { const float_v ii (i + 2); asm volatile ("# Barrier 1 %0 %1 %2" : : "r" (&ii), "r" (&i), "r" (array) : "memory"); float_v a (array, i); asm volatile ("# Barrier 2 %0 %1 %2 %3" : : "r" (&ii), "r" (&i), "r" (array), "r" (&a) : "memory"); if ((a == ii) != 3) abort(); } return 0; } Apparently there is some stack slot sharing, and RTL DSE decides to remove a store to a stack-related address as dead when that memory is later read using a non-%rsp-related read.
Patch posted: http://gcc.gnu.org/ml/gcc-patches/2009-08/msg00226.html
Subject: Bug 40924 Author: jakub Date: Wed Aug 5 12:36:34 2009 New Revision: 150483 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=150483 Log: PR rtl-optimization/40924 * dse.c (canon_address): Before calling cselib_expand_value_rtx make sure canon_rtx (mem_address) isn't simpler than canon_rtx (expanded_mem_address). * g++.dg/torture/pr40924.C: New test. Added: trunk/gcc/testsuite/g++.dg/torture/pr40924.C Modified: trunk/gcc/ChangeLog trunk/gcc/dse.c trunk/gcc/testsuite/ChangeLog
Subject: Bug 40924 Author: jakub Date: Wed Aug 5 12:54:21 2009 New Revision: 150484 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=150484 Log: PR rtl-optimization/40924 * dse.c (canon_address): Before calling cselib_expand_value_rtx make sure canon_rtx (mem_address) isn't simpler than canon_rtx (expanded_mem_address). * g++.dg/torture/pr40924.C: New test. Added: branches/gcc-4_4-branch/gcc/testsuite/g++.dg/torture/pr40924.C Modified: branches/gcc-4_4-branch/gcc/ChangeLog branches/gcc-4_4-branch/gcc/dse.c branches/gcc-4_4-branch/gcc/testsuite/ChangeLog
Fixed.