/* gcc-4.6.0 optimization regression on x86_64-unknown-linux-gnu: % gcc -O1 -funroll-loops -o foo foo.c foo order= 11 % % gcc -O2 -funroll-loops -o foo foo.c foo # hangs make: *** [bad] Interrupt % % gcc-4.5.1 -O2 -funroll-loops -o foo foo.c foo order=11 % % gcc -v Using built-in specs. COLLECT_GCC=gcc COLLECT_LTO_WRAPPER=/usr/local/gcc-4.6.0/x86_64-Linux-core2-fc/libexec/gcc/x86_64-unknown-linux-gnu/4.6.0/lto-wrapper Target: x86_64-unknown-linux-gnu Configured with: /usr/local/gcc-4.6.0/src/gcc-4.6.0/configure --enable-languages=c,c++,fortran --with-gnu-as --with-gnu-as=/usr/local/binutils-2.21/x86_64-Linux-core2-fc-gcc-4.5.1-rh/bin/as --with-gnu-ld --with-ld=/usr/local/binutils-2.21/x86_64-Linux-core2-fc-gcc-4.5.1-rh/bin/ld --with-gmp=/usr/local/mpir-2.3.0/x86_64-Linux-core2-fc-gcc-4.5.1-rh --with-mpfr=/usr/local/mpfr-3.0.0/x86_64-Linux-core2-fc-mpir-2.3.0-gcc-4.5.1-rh --with-mpc=/usr/local/mpc-0.9/x86_64-Linux-core2-fc-mpir-2.3.0-mpfr-3.0.0-gcc-4.5.1-rh --prefix=/usr/local/gcc-4.6.0/x86_64-Linux-core2-fc Thread model: posix gcc version 4.6.0 (GCC) % Apologies that this code is so long, but I can not find any way to shorten it further and still demonstrate the bug. */ #include <stdio.h> #include <stdlib.h> #include <string.h> #define SIZE 12 unsigned long int ss[SIZE][2]; #define SET_BIT_MASK(x) ((unsigned long)1<<(x)) #define SET_ELEMENT_CONTAINS(e,v) ((e)&SET_BIT_MASK(v)) #define SET_CONTAINS_FAST(s,a) (SET_ELEMENT_CONTAINS((s)[0], (a))) #define SET_CONTAINS(s,a) (((a)<SET_MAX_SIZE(s))?SET_CONTAINS_FAST(s,a):0) #define SET_MAX_SIZE(s) ((s)[-1]) typedef struct graph graph_t; struct graph { int n; unsigned long *edges[SIZE]; } gg; #define GRAPH_IS_EDGE(g,i,j) \ (((j)<(((g)->edges[(0)]))[-1])?SET_CONTAINS_FAST((g)->edges[(i)],j):0) /* SET_CONTAINS((g)->edges[(i)], (j)) */ int bar(graph_t *g) { int i,j,v; int tmp_used[SIZE]; int degree[SIZE]; int order[SIZE]; int maxdegree,maxvertex=0; int samecolor; for (i = 0; i < SIZE; i++) { order[i] = 0; degree[i] = 0; } for (i=0; i < g->n; i++) { for (j=0; j < g->n; j++) { if ((i==j) && GRAPH_IS_EDGE(g,i,j)) { exit(0);; } if (GRAPH_IS_EDGE(g,i,j)) degree[i]++; } } v=0; while (v < SIZE) { memset(tmp_used,0,SIZE * sizeof(int)); do { maxdegree=0; samecolor=0; for (i=0; i < SIZE; i++) { if (!tmp_used[i] && degree[i] >= maxdegree) { maxvertex=i; maxdegree=degree[i]; samecolor=1; } } if (samecolor) { order[v]=maxvertex; degree[maxvertex]=-1; v++; for (i=0; i < SIZE; i++) { if (GRAPH_IS_EDGE(g,maxvertex,i)) { tmp_used[i]=1; degree[i]--; } } } } while (samecolor); } return order[0]; } graph_t *make_graph() { graph_t *g; int i; for (i=0; i < SIZE; i++) { ss[i][0] = SIZE; } ss[0][1] = 2114; ss[1][1] = 37; ss[2][1] = 1034; ss[3][1] = 532; ss[4][1] = 296; ss[5][1] = 82; ss[6][1] = 161; ss[7][1] = 2368; ss[8][1] = 656; ss[9][1] = 1288; ss[10][1] = 2564; ss[11][1] = 1153; g = ≫ g->n=SIZE; for (i=0; i < SIZE; i++) { g->edges[i]=&(ss[i][1]); } return g; } int main(int argc, char **argv) { graph_t *g; int order; g = make_graph(); order = bar(g); printf("order= %d\n", order); return 0; }
Confirmed. -fno-ivopts works around the issue (I didn't yet investigate whether it causes the issue).
It is caused by revision 162653: http://gcc.gnu.org/ml/gcc-cvs/2010-07/msg01007.html
(In reply to comment #2) > It is caused by revision 162653: > > http://gcc.gnu.org/ml/gcc-cvs/2010-07/msg01007.html Looked at IVOPT transformation -- seems ok. The program passes with the following options: -O2 -funroll-loops regression.c -fno-tree-vrp -fno-tree-dominator-opts -fno-gcse Removing any of the -fno-xxx options, the program fail. -fno-gcse makes difference indicates tree level transformations are fine -- possibly bad aliasing? David
The code is fine with -fno-strict-aliasing, so I doubt that. Just -fno-gcse also doesn't fix it (nor does -fno-schedule-insns2, the usual RTL alias related miscompiler).
Slightly reduced testcase, fails with -O2 -funroll-loops on x86_64-linux, succeeds with -O2: unsigned long int s[12][2] = { { 12, 2114 }, { 12, 37 }, { 12, 1034 }, { 12, 532 }, { 12, 296 }, { 12, 82 }, { 12, 161 }, { 12, 2368 }, { 12, 656 }, { 12, 1288 }, { 12, 2564 }, { 12, 1153 } }; struct { int n; unsigned long *edges[12]; } g = { 12, { &s[0][1], &s[1][1], &s[2][1], &s[3][1], &s[4][1], &s[5][1], &s[6][1], &s[7][1], &s[8][1], &s[9][1], &s[10][1], &s[11][1] } }; #define SET_BIT_MASK(x) ((unsigned long)1<<(x)) #define SET_ELEMENT_CONTAINS(e,v) ((e)&SET_BIT_MASK(v)) #define SET_CONTAINS_FAST(s,a) (SET_ELEMENT_CONTAINS((s)[0], (a))) #define GRAPH_IS_EDGE(g,i,j) \ (((j)<(((g)->edges[(0)]))[-1])?SET_CONTAINS_FAST((g)->edges[(i)],j):0) int main () { int i, j, v, a[12], c[12], e = 0; for (i = 0; i < 12; i++) c[i] = 0; for (i = 0; i < g.n; i++) for (j = 0; j < g.n; j++) { if (i == j && GRAPH_IS_EDGE (&g, i, j)) __builtin_exit (0); if (GRAPH_IS_EDGE (&g, i, j)) c[i]++; } for (i = 0; i < 12; i++) if (c[i] != 3) __builtin_abort (); for (v = 0; v < 2; v++) { __builtin_memset (a, 0, 12 * sizeof (int)); for (i = 0; i < 12; i++) if (a[i]) e = i; v++; } return e; }
Tiny bit more simplified, without the GRAPH_IS_EDGE and related macros: unsigned long int s[12][2] = { { 12, 2114 }, { 12, 37 }, { 12, 1034 }, { 12, 532 }, { 12, 296 }, { 12, 82 }, { 12, 161 }, { 12, 2368 }, { 12, 656 }, { 12, 1288 }, { 12, 2564 }, { 12, 1153 } }; struct { int n; unsigned long *edges[12]; } g = { 12, { &s[0][1], &s[1][1], &s[2][1], &s[3][1], &s[4][1], &s[5][1], &s[6][1], &s[7][1], &s[8][1], &s[9][1], &s[10][1], &s[11][1] } }; int main () { int i, j, v, a[12], c[12], e = 0; for (i = 0; i < 12; i++) c[i] = 0; for (i = 0; i < g.n; i++) for (j = 0; j < g.n; j++) { if (i == j && j < g.edges[0][-1] && (g.edges[i][0] & (1UL << j))) __builtin_exit (0); if (j < g.edges[0][-1] && (g.edges[i][0] & (1UL << j))) c[i]++; } for (i = 0; i < 12; i++) if (c[i] != 3) __builtin_abort (); for (v = 0; v < 2; v++) { __builtin_memset (a, 0, 12 * sizeof (int)); for (i = 0; i < 12; i++) if (a[i]) e = i; } return e; }
unsigned long int s[12][2] = { { 12, ~1 }, { 12, ~2 }, { 12, ~4 }, { 12, ~8 }, { 12, ~16 }, { 12, ~32 }, { 12, ~64 }, { 12, ~128 }, { 12, ~256 }, { 12, ~512 }, { 12, ~1024 }, { 12, ~2048 } }; struct { int n; unsigned long *e[12]; } g = { 12, { &s[0][1], &s[1][1], &s[2][1], &s[3][1], &s[4][1], &s[5][1], &s[6][1], &s[7][1], &s[8][1], &s[9][1], &s[10][1], &s[11][1] } }; int main () { int i, j, c[12]; for (i = 0; i < 12; i++) c[i] = 0; for (i = 0; i < g.n; i++) for (j = 0; j < g.n; j++) { if (i == j && j < g.e[0][-1] && (g.e[i][0] & (1UL << j))) __builtin_exit (0); if (j < g.e[0][-1] && (g.e[i][0] & (1UL << j))) c[i]++; } for (i = 0; i < 12; i++) if (c[i] != 11) __builtin_abort (); return 0; } Slightly more reduced. This one shows clearly on which iteration of the inner unrolled loop there is some problem, as c during abort is { 10, 11, 10 ... }, which means & 1 testing is performed, & 2 is wrong and & 4 and higher works too.
/* PR rtl-optimization/48774 */ /* { dg-do run } */ /* { dg-options "-O2 -funroll-loops" } */ extern void abort (void); unsigned long int s[24] = { 12, ~1, 12, ~2, 12, ~4, 12, ~8, 12, ~16, 12, ~32, 12, ~64, 12, ~128, 12, ~256, 12, ~512, 12, ~1024, 12, ~2048 }; struct { int n; unsigned long *e[12]; } g = { 12, { &s[0], &s[2], &s[4], &s[6], &s[8], &s[10], &s[12], &s[14], &s[16], &s[18], &s[20], &s[22] } }; int c[12]; __attribute__((noinline, noclone)) void foo (void) { int i, j; for (i = 0; i < g.n; i++) for (j = 0; j < g.n; j++) { if (i == j && j < g.e[0][0] && (g.e[i][1] & (1UL << j))) abort (); if (j < g.e[0][0] && (g.e[i][1] & (1UL << j))) c[i]++; } } int main () { int i; asm volatile ("" : "+m" (s), "+m" (g), "+m" (c)); foo (); for (i = 0; i < 12; i++) if (c[i] != 11) abort (); return 0; } Apparently the [0][-1] didn't matter, and this testcase also decreases the size of the miscompiled routine.
This looks like a target bug to me in *.postreload we have: (insn 615 177 616 32 (set (reg:CCC 17 flags) (compare:CCC (zero_extract:DI (reg:DI 5 di [orig:129 prephitmp.8 ] [129]) (const_int 1 [0x1]) (const_int 1 [0x1])) (const_int 0 [0]))) pr48774-6.c:23 377 {*testqi_ext_3_rex64} (nil)) (jump_insn 616 615 184 32 (set (pc) (if_then_else (ne (reg:CCC 17 flags) (const_int 0 [0])) (label_ref 214) (pc))) pr48774-6.c:23 589 {*jcc_1} (expr_list:REG_BR_PROB (const_int 5000 [0x1388]) (nil)) which comes from jcc_bt* splitter, where the second const1_rtx still was a register at the time of the split, but IRA materialized it into a constant. The CCC mode is fine for bt insn. Unfortunately split2 pass splits this into: (insn 626 177 616 32 (set (reg:CCC 17 flags) (compare:CCC (and:QI (reg:QI 5 di [orig:129 prephitmp.8 ] [129]) (const_int 2 [0x2])) (const_int 0 [0]))) pr48774-6.c:23 369 {*testqi_1_maybe_si} (nil)) (jump_insn 616 626 184 32 (set (pc) (if_then_else (ne (reg:CCC 17 flags) (const_int 0 [0])) (label_ref 214) (pc))) pr48774-6.c:23 589 {*jcc_1} (expr_list:REG_BR_PROB (const_int 5000 [0x1388]) (nil)) -> 214) which is already incorrect, if we want to replace bt-ish test, we'd need to update the mode to CCmode or similar and update the user(s). The generated assembly then has: andl $2, %edi jnc .L21 whereas it should have been either btl $1, %edi jnc .L21 or andl $2, %edi je .L21
Created attachment 24169 [details] gcc47-pr48774.patch Untested fix. The additional condition could be changed to just CCCmode check, or on the other side have: || !(TARGET_USE_BT || optimize_function_for_size_p (cfun)) in as well. Or *bt<mode> would need to be represented in RTL in some different way, where the setting of Carry is natural to the operation and couldn't be confused with testqi.
Author: jakub Date: Tue May 3 13:01:12 2011 New Revision: 173301 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=173301 Log: PR target/48774 * config/i386/i386.c (ix86_match_ccmode): For CC{A,C,O,S}mode only succeed if req_mode is the same as set_mode. * gcc.dg/pr48774.c: New test. Added: trunk/gcc/testsuite/gcc.dg/pr48774.c Modified: trunk/gcc/ChangeLog trunk/gcc/config/i386/i386.c trunk/gcc/testsuite/ChangeLog
Author: jakub Date: Tue May 3 13:06:06 2011 New Revision: 173302 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=173302 Log: PR target/48774 * config/i386/i386.c (ix86_match_ccmode): For CC{A,C,O,S}mode only succeed if req_mode is the same as set_mode. * gcc.dg/pr48774.c: New test. Added: branches/gcc-4_6-branch/gcc/testsuite/gcc.dg/pr48774.c Modified: branches/gcc-4_6-branch/gcc/ChangeLog branches/gcc-4_6-branch/gcc/config/i386/i386.c branches/gcc-4_6-branch/gcc/testsuite/ChangeLog
Fixed.
Author: jakub Date: Tue May 3 16:38:25 2011 New Revision: 173329 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=173329 Log: PR target/48774 * config/i386/i386.c (ix86_match_ccmode): For CC{A,C,O,S}mode only succeed if req_mode is the same as set_mode. * gcc.dg/pr48774.c: New test. Added: branches/gcc-4_5-branch/gcc/testsuite/gcc.dg/pr48774.c Modified: branches/gcc-4_5-branch/gcc/ChangeLog branches/gcc-4_5-branch/gcc/config/i386/i386.c branches/gcc-4_5-branch/gcc/testsuite/ChangeLog
Author: jakub Date: Wed May 4 09:21:09 2011 New Revision: 173359 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=173359 Log: Backported from mainline 2011-05-03 Uros Bizjak <ubizjak@gmail.com> Jakub Jelinek <jakub@redhat.com> PR target/48774 * config/i386/i386.c (ix86_match_ccmode): For CC{A,C,O,S}mode only succeed if req_mode is the same as set_mode. * gcc.dg/pr48774.c: New test. Added: branches/gcc-4_4-branch/gcc/testsuite/gcc.dg/pr48774.c Modified: branches/gcc-4_4-branch/gcc/ChangeLog branches/gcc-4_4-branch/gcc/config/i386/i386.c branches/gcc-4_4-branch/gcc/testsuite/ChangeLog