typedef __SIZE_TYPE__ size_t; static int mymemcmp1 (unsigned long int, unsigned long int) __attribute__ ((__nothrow__)); __inline static int mymemcmp1 (unsigned long int a, unsigned long int b) { long int srcp1 = (long int) &a; long int srcp2 = (long int) &b; unsigned long int a0, b0; do { a0 = ((unsigned char *) srcp1)[0]; b0 = ((unsigned char *) srcp2)[0]; srcp1 += 1; srcp2 += 1; } while (a0 == b0); return a0 - b0; } static int mymemcmp2 (long, long, size_t) __attribute__ ((__nothrow__)); static int mymemcmp2 (long int srcp1, long int srcp2, size_t len) { unsigned long int a0, a1; unsigned long int b0, b1; switch (len % 4) { default: case 2: a0 = ((unsigned long int *) srcp1)[0]; b0 = ((unsigned long int *) srcp2)[0]; srcp1 -= 2 * (sizeof (unsigned long int)); srcp2 -= 2 * (sizeof (unsigned long int)); len += 2; goto do1; case 3: a1 = ((unsigned long int *) srcp1)[0]; b1 = ((unsigned long int *) srcp2)[0]; srcp1 -= (sizeof (unsigned long int)); srcp2 -= (sizeof (unsigned long int)); len += 1; goto do2; case 0: if (16 <= 3 * (sizeof (unsigned long int)) && len == 0) return 0; a0 = ((unsigned long int *) srcp1)[0]; b0 = ((unsigned long int *) srcp2)[0]; goto do3; case 1: a1 = ((unsigned long int *) srcp1)[0]; b1 = ((unsigned long int *) srcp2)[0]; srcp1 += (sizeof (unsigned long int)); srcp2 += (sizeof (unsigned long int)); len -= 1; if (16 <= 3 * (sizeof (unsigned long int)) && len == 0) goto do0; } do { a0 = ((unsigned long int *) srcp1)[0]; b0 = ((unsigned long int *) srcp2)[0]; if (a1 != b1) return mymemcmp1 ((a1), (b1)); do3: a1 = ((unsigned long int *) srcp1)[1]; b1 = ((unsigned long int *) srcp2)[1]; if (a0 != b0) return mymemcmp1 ((a0), (b0)); do2: a0 = ((unsigned long int *) srcp1)[2]; b0 = ((unsigned long int *) srcp2)[2]; if (a1 != b1) return mymemcmp1 ((a1), (b1)); do1: a1 = ((unsigned long int *) srcp1)[3]; b1 = ((unsigned long int *) srcp2)[3]; if (a0 != b0) return mymemcmp1 ((a0), (b0)); srcp1 += 4 * (sizeof (unsigned long int)); srcp2 += 4 * (sizeof (unsigned long int)); len -= 4; } while (len != 0); do0: if (a1 != b1) return mymemcmp1 ((a1), (b1)); return 0; } static int mymemcmp3 (long, long, size_t) __attribute__ ((__nothrow__)); static int mymemcmp3 (long int srcp1, long int srcp2, size_t len) { unsigned long int a0, a1, a2, a3; unsigned long int b0, b1, b2, b3; unsigned long int x; int shl, shr; shl = 8 * (srcp1 % (sizeof (unsigned long int))); shr = 8 * (sizeof (unsigned long int)) - shl; srcp1 &= -(sizeof (unsigned long int)); switch (len % 4) { default: case 2: a1 = ((unsigned long int *) srcp1)[0]; a2 = ((unsigned long int *) srcp1)[1]; b2 = ((unsigned long int *) srcp2)[0]; srcp1 -= 1 * (sizeof (unsigned long int)); srcp2 -= 2 * (sizeof (unsigned long int)); len += 2; goto do1; case 3: a0 = ((unsigned long int *) srcp1)[0]; a1 = ((unsigned long int *) srcp1)[1]; b1 = ((unsigned long int *) srcp2)[0]; srcp2 -= 1 * (sizeof (unsigned long int)); len += 1; goto do2; case 0: if (16 <= 3 * (sizeof (unsigned long int)) && len == 0) return 0; a3 = ((unsigned long int *) srcp1)[0]; a0 = ((unsigned long int *) srcp1)[1]; b0 = ((unsigned long int *) srcp2)[0]; srcp1 += 1 * (sizeof (unsigned long int)); goto do3; case 1: a2 = ((unsigned long int *) srcp1)[0]; a3 = ((unsigned long int *) srcp1)[1]; b3 = ((unsigned long int *) srcp2)[0]; srcp1 += 2 * (sizeof (unsigned long int)); srcp2 += 1 * (sizeof (unsigned long int)); len -= 1; if (16 <= 3 * (sizeof (unsigned long int)) && len == 0) goto do0; } do { a0 = ((unsigned long int *) srcp1)[0]; b0 = ((unsigned long int *) srcp2)[0]; x = (((a2) >> (shl)) | ((a3) << (shr))); if (x != b3) return mymemcmp1 ((x), (b3)); do3: a1 = ((unsigned long int *) srcp1)[1]; b1 = ((unsigned long int *) srcp2)[1]; x = (((a3) >> (shl)) | ((a0) << (shr))); if (x != b0) return mymemcmp1 ((x), (b0)); do2: a2 = ((unsigned long int *) srcp1)[2]; b2 = ((unsigned long int *) srcp2)[2]; x = (((a0) >> (shl)) | ((a1) << (shr))); if (x != b1) return mymemcmp1 ((x), (b1)); do1: a3 = ((unsigned long int *) srcp1)[3]; b3 = ((unsigned long int *) srcp2)[3]; x = (((a1) >> (shl)) | ((a2) << (shr))); if (x != b2) return mymemcmp1 ((x), (b2)); srcp1 += 4 * (sizeof (unsigned long int)); srcp2 += 4 * (sizeof (unsigned long int)); len -= 4; } while (len != 0); do0: x = (((a2) >> (shl)) | ((a3) << (shr))); if (x != b3) return mymemcmp1 ((x), (b3)); return 0; } __attribute__ ((noinline)) int mymemcmp (const void *s1, const void *s2, size_t len) { unsigned long int a0; unsigned long int b0; long int srcp1 = (long int) s1; long int srcp2 = (long int) s2; if (srcp1 % (sizeof (unsigned long int)) == 0) return mymemcmp2 (srcp1, srcp2, len / (sizeof (unsigned long int))); else return mymemcmp3 (srcp1, srcp2, len / (sizeof (unsigned long int))); } char buf[256] __attribute__((aligned (16))); char buf2[256] __attribute__((aligned (16))); int main (void) { __builtin_memcpy (buf + 9, "\x1\x37\x82\xa7\x55\x49\x9d\xbf\xf8\x44\xb6\x55\x17\x8e\xf9", 15); __builtin_memcpy (buf2 + 24, "\x1\x37\x82\xa7\x55\x49\xd0\xf3\xb7\x2a\x6d\x23\x71\x49\x6a", 15); if (mymemcmp (buf + 9, buf2 + 24, 33) != -51) __builtin_abort (); return 0; } is miscompiled at -O2 or -O2 -fno-strict-aliasing, doesn't return the expected -51, but something that depends on the stack content etc. (e.g. prints different value when run from gdb's inferior and in the program). Note mymemcmp has a couple of lines that weren't affecting the result removed.
-fno-tree-pre fixes this.
Confirmed. PRE doesn't do anything interesting though, neither are the differences in the optimized dump interesting from a first look.
In assembly, it seems the a and b arguments to mymemcmp1 aren't stored into memory at all: .loc 1 128 0 movq (%rax), %r12 #* srcp2.272, b0 .LVL54: .loc 1 126 0 movq (%rdi), %rsi #* srcp1, a3 .LVL55: .loc 1 127 0 movq 8(%rdi), %rbx #, a0 .LVL56: .loc 1 129 0 addq $8, %rdi #, srcp1.270 .LVL57: .L28: .loc 1 152 0 movl %r9d, %ecx #, .LVL58: movq %rbx, %r10 # a0, tmp154 .loc 1 149 0 movq 8(%rdi), %r11 #, a1 .LVL59: .loc 1 152 0 salq %cl, %r10 #, tmp154 movl %r8d, %ecx #, .loc 1 150 0 movq 8(%rax), %rbp #, b1 .LVL60: .loc 1 152 0 shrq %cl, %rsi #, tmp155 .LVL61: orq %rsi, %r10 # tmp155, tmp156 cmpq %r12, %r10 # b0, tmp156 je .L26 #, .LBB112: .LBB113: .loc 1 8 0 leaq -8(%rsp), %rcx #, srcp1 .LVL62: .loc 1 9 0 leaq -16(%rsp), %rdx #, srcp2 .LVL63: .p2align 4,,10 .p2align 3 .L31: .loc 1 13 0 movzbl (%rcx), %eax #* srcp1, a0 .LVL64: .loc 1 14 0 movzbl (%rdx), %edi #* srcp2, b0 ... At je .L26 insn %r12 register contains correct value of b0 variable and %r10 of x (shifted + ored together value). But then suddenly the code assumes it is in memory without actually storing it there.
*.copyprop4 still contains the needed stores: [gg5.c : 151] D.2147_293 = a3D.2068_292 >> shlD.2074_195; [gg5.c : 151] D.2148_295 = a0D.2065_294 << shrD.2075_198; [gg5.c : 151] xD.2073_296 = D.2148_295 | D.2147_293; [gg5.c : 152] if (xD.2073_296 != b0D.2069_297) goto <bb 51>; else goto <bb 55> (do2); # SUCC: 51 [72.0%] (true,exec) 55 [28.0%] (irreducible,false,exec) # BLOCK 51 freq:370, starting at line 0 # PRED: 50 [72.0%] (true,exec) # aD.2099_621 = VDEF <aD.2099_620(D)> { aD.2099 } aD.2099 = xD.2073_296; # bD.2098_623 = VDEF <bD.2098_622(D)> { bD.2098 } bD.2098 = b0D.2069_297; [gg5.c : 8] srcp1D.2100_298 = (long intD.2) [gg5.c : 8] &aD.2099; [gg5.c : 9] srcp2D.2101_299 = (long intD.2) [gg5.c : 9] &bD.2098; # BLOCK 52 freq:2640 # PRED: 51 [100.0%] (fallthru,exec) 53 [100.0%] (fallthru,exec) # srcp1D.2100_300 = PHI <srcp1D.2100_298(51), srcp1D.2100_308(53)> # srcp2D.2101_304 = PHI <srcp2D.2101_299(51), srcp2D.2101_309(53)> [gg5.c : 13] srcp1.0D.2149_301 = (unsigned charD.10 *) srcp1D.2100_300; [gg5.c : 13] # VUSE <aD.1965_413(D), bD.1970_414(D), aD.1971_415(D), bD.1976_416(D), aD.1977_417(D), bD.1982_418(D), aD.1983_41 9(D), SMT.25D.2200_420(D)> { aD.1965 bD.1970 aD.1971 bD.1976 aD.1977 bD.1982 aD.1983 SMT.25D.2200 } D.2150_302 = *srcp1.0D.2149_301; [gg5.c : 13] a0D.2102_303 = (long unsigned intD.4) D.2150_302; [gg5.c : 14] srcp2.1D.2151_305 = (unsigned charD.10 *) srcp2D.2101_304; [gg5.c : 14] # VUSE <aD.1965_413(D), bD.1970_414(D), aD.1971_415(D), bD.1976_416(D), aD.1977_417(D), bD.1982_418(D), aD.1983_41 9(D), SMT.25D.2200_420(D)> { aD.1965 bD.1970 aD.1971 bD.1976 aD.1977 bD.1982 aD.1983 SMT.25D.2200 } D.2152_306 = *srcp2.1D.2151_305; (a = x and b = b0) before taking their addresses, but *.dceloop1 nukes them, supposedly because VUSE on the *srcp1 and *srcp2 reads is wrong. This is with -O2 -fno-strict-aliasing, but even with strict aliasing taking an address of an unsigned long variable, casting it to unsigned char * and reading it a single byte at a time through unsigned char * pointer is valid.
At *.crited there is still a virtual dependency between a = x; (and b = b0;) and *srcp1 + *srcp2 reads: # MPT.26D.2201_438 = VDEF <MPT.26D.2201_421(D)> { MPT.26D.2201 } aD.2099 = xD.2073_296; # MPT.26D.2201_439 = VDEF <MPT.26D.2201_438> { MPT.26D.2201 } bD.2098 = b0D.2069_297; ... [gg5.c : 13] # VUSE <aD.1965_413(D), bD.1970_414(D), aD.1971_415(D), bD.1976_416(D), aD.1977_417(D), bD.1982_418(D), aD.1983_41 9(D), SMT.25D.2200_420(D), MPT.26D.2201_439> { aD.1965 bD.1970 aD.1971 bD.1976 aD.1977 bD.1982 aD.1983 SMT.25D.2200 MPT.26D.2201 } D.2150_302 = *srcp1.0D.2149_301; ... [gg5.c : 14] # VUSE <aD.1965_413(D), bD.1970_414(D), aD.1971_415(D), bD.1976_416(D), aD.1977_417(D), bD.1982_418(D), aD.1983_41 9(D), SMT.25D.2200_420(D), MPT.26D.2201_439> { aD.1965 bD.1970 aD.1971 bD.1976 aD.1977 bD.1982 aD.1983 SMT.25D.2200 MPT.26D.2201 } D.2152_306 = *srcp2.1D.2151_305; through MPT.26. But in *.pre that's gone and there is no virtual dependency between them anymore.
I will have a look.
During update_alias_info we collect the variables that are written to, but because we use gimple_stored_syms to get at them we miss those that are in memory partitions. If the new partitioning moves them out of a partition we then miss conflicts for them. Oops. I have a patch.
Created attachment 16633 [details] prototype patch Prototype patch. It needs some work as appearantly we cannot use MPT_SYMBOLS reliably there - I get random segfaults / referenced_var_lookup ICEs. As we are only interested in symbols we can just look at the statement manually. But at least for asms that will need some work.
Subject: Bug 38051 Author: rguenth Date: Sat Nov 15 15:37:57 2008 New Revision: 141887 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=141887 Log: 2008-11-15 Richard Guenther <rguenther@suse.de> PR tree-optimization/38051 * tree-ssa-alias.c (update_alias_info_1): Manually find written variables. * gcc.c-torture/execute/pr38051.c: New testcase. Added: trunk/gcc/testsuite/gcc.c-torture/execute/pr38051.c Modified: trunk/gcc/ChangeLog trunk/gcc/testsuite/ChangeLog trunk/gcc/tree-ssa-alias.c
Fixed.
The test fails on powerpc-apple-darwin9 (revision 141945, 32 and 64 bit modes): FAIL: gcc.c-torture/execute/pr38051.c execution, -O0 FAIL: gcc.c-torture/execute/pr38051.c execution, -O1 FAIL: gcc.c-torture/execute/pr38051.c execution, -O2 FAIL: gcc.c-torture/execute/pr38051.c execution, -O3 -fomit-frame-pointer FAIL: gcc.c-torture/execute/pr38051.c execution, -O3 -fomit-frame-pointer -funroll-loops FAIL: gcc.c-torture/execute/pr38051.c execution, -O3 -fomit-frame-pointer -funroll-all-loops -finline-functions FAIL: gcc.c-torture/execute/pr38051.c execution, -O3 -g FAIL: gcc.c-torture/execute/pr38051.c execution, -Os
The new test is also failing on IA64 HP-UX and PA HP-UX (32 and 64 bits). It is not failing on IA64 Linux. Could the test have a big-endian/little-endian issue?
See http://gcc.gnu.org/ml/gcc-patches/2008-11/msg00905.html
Subject: Bug 38051 Author: jakub Date: Tue Nov 18 23:01:35 2008 New Revision: 141983 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=141983 Log: PR tree-optimization/38051 * gcc.c-torture/execute/pr38051.c (buf): Remove aligned attribute. (buf2): Removed. (main): Only run on little endian targets with sizeof (long) == sizeof (void *). Use just one buffer, align the pointers at runtime. Modified: trunk/gcc/testsuite/ChangeLog trunk/gcc/testsuite/gcc.c-torture/execute/pr38051.c
*** Bug 38169 has been marked as a duplicate of this bug. ***
*** Bug 38478 has been marked as a duplicate of this bug. ***