Bug 38051 - [4.4 Regression] Miscompilation of glibc's memcmp
Summary: [4.4 Regression] Miscompilation of glibc's memcmp
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 4.4.0
: P1 normal
Target Milestone: 4.4.0
Assignee: Richard Biener
URL:
Keywords: alias, wrong-code
: 38169 38478 (view as bug list)
Depends on:
Blocks:
 
Reported: 2008-11-07 14:39 UTC by Jakub Jelinek
Modified: 2008-12-10 18:43 UTC (History)
5 users (show)

See Also:
Host:
Target: x86_64-linux
Build:
Known to work:
Known to fail:
Last reconfirmed: 2008-11-07 17:35:59


Attachments
prototype patch (2.15 KB, patch)
2008-11-07 19:33 UTC, Richard Biener
Details | Diff

Note You need to log in before you can comment on or make changes to this bug.
Description Jakub Jelinek 2008-11-07 14:39:45 UTC
typedef __SIZE_TYPE__ size_t;
static int mymemcmp1 (unsigned long int, unsigned long int)
  __attribute__ ((__nothrow__));

__inline static int
mymemcmp1 (unsigned long int a, unsigned long int b)
{
  long int srcp1 = (long int) &a;
  long int srcp2 = (long int) &b;
  unsigned long int a0, b0;
  do
    {
      a0 = ((unsigned char *) srcp1)[0];
      b0 = ((unsigned char *) srcp2)[0];
      srcp1 += 1;
      srcp2 += 1;
    }
  while (a0 == b0);
  return a0 - b0;
}

static int mymemcmp2 (long, long, size_t) __attribute__ ((__nothrow__));

static int
mymemcmp2 (long int srcp1, long int srcp2, size_t len)
{
  unsigned long int a0, a1;
  unsigned long int b0, b1;
  switch (len % 4)
    {
    default:
    case 2:
      a0 = ((unsigned long int *) srcp1)[0];
      b0 = ((unsigned long int *) srcp2)[0];
      srcp1 -= 2 * (sizeof (unsigned long int));
      srcp2 -= 2 * (sizeof (unsigned long int));
      len += 2;
      goto do1;
    case 3:
      a1 = ((unsigned long int *) srcp1)[0];
      b1 = ((unsigned long int *) srcp2)[0];
      srcp1 -= (sizeof (unsigned long int));
      srcp2 -= (sizeof (unsigned long int));
      len += 1;
      goto do2;
    case 0:
      if (16 <= 3 * (sizeof (unsigned long int)) && len == 0)
        return 0;
      a0 = ((unsigned long int *) srcp1)[0];
      b0 = ((unsigned long int *) srcp2)[0];
      goto do3;
    case 1:
      a1 = ((unsigned long int *) srcp1)[0];
      b1 = ((unsigned long int *) srcp2)[0];
      srcp1 += (sizeof (unsigned long int));
      srcp2 += (sizeof (unsigned long int));
      len -= 1;
      if (16 <= 3 * (sizeof (unsigned long int)) && len == 0)
        goto do0;
    }
  do
    {
      a0 = ((unsigned long int *) srcp1)[0];
      b0 = ((unsigned long int *) srcp2)[0];
      if (a1 != b1)
        return mymemcmp1 ((a1), (b1));
    do3:
      a1 = ((unsigned long int *) srcp1)[1];
      b1 = ((unsigned long int *) srcp2)[1];
      if (a0 != b0)
        return mymemcmp1 ((a0), (b0));
    do2:
      a0 = ((unsigned long int *) srcp1)[2];
      b0 = ((unsigned long int *) srcp2)[2];
      if (a1 != b1)
        return mymemcmp1 ((a1), (b1));
    do1:
      a1 = ((unsigned long int *) srcp1)[3];
      b1 = ((unsigned long int *) srcp2)[3];
      if (a0 != b0)
        return mymemcmp1 ((a0), (b0));
      srcp1 += 4 * (sizeof (unsigned long int));
      srcp2 += 4 * (sizeof (unsigned long int));
      len -= 4;
    }
  while (len != 0);
do0:
  if (a1 != b1)
    return mymemcmp1 ((a1), (b1));
  return 0;
}

static int mymemcmp3 (long, long, size_t) __attribute__ ((__nothrow__));

static int
mymemcmp3 (long int srcp1, long int srcp2, size_t len)
{
  unsigned long int a0, a1, a2, a3;
  unsigned long int b0, b1, b2, b3;
  unsigned long int x;
  int shl, shr;
  shl = 8 * (srcp1 % (sizeof (unsigned long int)));
  shr = 8 * (sizeof (unsigned long int)) - shl;
  srcp1 &= -(sizeof (unsigned long int));
  switch (len % 4)
    {
    default:
    case 2:
      a1 = ((unsigned long int *) srcp1)[0];
      a2 = ((unsigned long int *) srcp1)[1];
      b2 = ((unsigned long int *) srcp2)[0];
      srcp1 -= 1 * (sizeof (unsigned long int));
      srcp2 -= 2 * (sizeof (unsigned long int));
      len += 2;
      goto do1;
    case 3:
      a0 = ((unsigned long int *) srcp1)[0];
      a1 = ((unsigned long int *) srcp1)[1];
      b1 = ((unsigned long int *) srcp2)[0];
      srcp2 -= 1 * (sizeof (unsigned long int));
      len += 1;
      goto do2;
    case 0:
      if (16 <= 3 * (sizeof (unsigned long int)) && len == 0)
        return 0;
      a3 = ((unsigned long int *) srcp1)[0];
      a0 = ((unsigned long int *) srcp1)[1];
      b0 = ((unsigned long int *) srcp2)[0];
      srcp1 += 1 * (sizeof (unsigned long int));
      goto do3;
    case 1:
      a2 = ((unsigned long int *) srcp1)[0];
      a3 = ((unsigned long int *) srcp1)[1];
      b3 = ((unsigned long int *) srcp2)[0];
      srcp1 += 2 * (sizeof (unsigned long int));
      srcp2 += 1 * (sizeof (unsigned long int));
      len -= 1;
      if (16 <= 3 * (sizeof (unsigned long int)) && len == 0)
        goto do0;
    }
  do
    {
      a0 = ((unsigned long int *) srcp1)[0];
      b0 = ((unsigned long int *) srcp2)[0];
      x = (((a2) >> (shl)) | ((a3) << (shr)));
      if (x != b3)
        return mymemcmp1 ((x), (b3));
    do3:
      a1 = ((unsigned long int *) srcp1)[1];
      b1 = ((unsigned long int *) srcp2)[1];
      x = (((a3) >> (shl)) | ((a0) << (shr)));
      if (x != b0)
        return mymemcmp1 ((x), (b0));
    do2:
      a2 = ((unsigned long int *) srcp1)[2];
      b2 = ((unsigned long int *) srcp2)[2];
      x = (((a0) >> (shl)) | ((a1) << (shr)));
      if (x != b1)
        return mymemcmp1 ((x), (b1));
    do1:
      a3 = ((unsigned long int *) srcp1)[3];
      b3 = ((unsigned long int *) srcp2)[3];
      x = (((a1) >> (shl)) | ((a2) << (shr)));
      if (x != b2)
        return mymemcmp1 ((x), (b2));
      srcp1 += 4 * (sizeof (unsigned long int));
      srcp2 += 4 * (sizeof (unsigned long int));
      len -= 4;
    }
  while (len != 0);
do0:
  x = (((a2) >> (shl)) | ((a3) << (shr)));
  if (x != b3)
    return mymemcmp1 ((x), (b3));
  return 0;
}

__attribute__ ((noinline))
int mymemcmp (const void *s1, const void *s2, size_t len)
{
  unsigned long int a0;
  unsigned long int b0;
  long int srcp1 = (long int) s1;
  long int srcp2 = (long int) s2;
  if (srcp1 % (sizeof (unsigned long int)) == 0)
    return mymemcmp2 (srcp1, srcp2, len / (sizeof (unsigned long int)));
  else
    return mymemcmp3 (srcp1, srcp2, len / (sizeof (unsigned long int)));
}

char buf[256] __attribute__((aligned (16)));
char buf2[256] __attribute__((aligned (16)));

int
main (void)
{
  __builtin_memcpy (buf + 9, "\x1\x37\x82\xa7\x55\x49\x9d\xbf\xf8\x44\xb6\x55\x17\x8e\xf9", 15);
  __builtin_memcpy (buf2 + 24, "\x1\x37\x82\xa7\x55\x49\xd0\xf3\xb7\x2a\x6d\x23\x71\x49\x6a", 15);
  if (mymemcmp (buf + 9, buf2 + 24, 33) != -51)
    __builtin_abort ();
  return 0;
}

is miscompiled at -O2 or -O2 -fno-strict-aliasing, doesn't return the expected -51, but something that depends on the stack content etc. (e.g. prints different
value when run from gdb's inferior and in the program).  Note mymemcmp has
a couple of lines that weren't affecting the result removed.
Comment 1 Jakub Jelinek 2008-11-07 14:57:36 UTC
-fno-tree-pre fixes this.
Comment 2 Richard Biener 2008-11-07 15:29:03 UTC
Confirmed.  PRE doesn't do anything interesting though, neither are the differences in the optimized dump interesting from a first look.
Comment 3 Jakub Jelinek 2008-11-07 16:19:13 UTC
In assembly, it seems the a and b arguments to mymemcmp1 aren't stored into memory at all:
        .loc 1 128 0
        movq    (%rax), %r12    #* srcp2.272, b0
.LVL54:
        .loc 1 126 0
        movq    (%rdi), %rsi    #* srcp1, a3
.LVL55:
        .loc 1 127 0
        movq    8(%rdi), %rbx   #, a0
.LVL56:
        .loc 1 129 0
        addq    $8, %rdi        #, srcp1.270
.LVL57:
.L28:
        .loc 1 152 0
        movl    %r9d, %ecx      #,
.LVL58:
        movq    %rbx, %r10      # a0, tmp154
        .loc 1 149 0
        movq    8(%rdi), %r11   #, a1
.LVL59:
        .loc 1 152 0
        salq    %cl, %r10       #, tmp154
        movl    %r8d, %ecx      #,
        .loc 1 150 0
        movq    8(%rax), %rbp   #, b1
.LVL60:
        .loc 1 152 0
        shrq    %cl, %rsi       #, tmp155
.LVL61:
        orq     %rsi, %r10      # tmp155, tmp156
        cmpq    %r12, %r10      # b0, tmp156
        je      .L26    #,
.LBB112:
.LBB113:
        .loc 1 8 0
        leaq    -8(%rsp), %rcx  #, srcp1
.LVL62:
        .loc 1 9 0
        leaq    -16(%rsp), %rdx #, srcp2
.LVL63:
        .p2align 4,,10
        .p2align 3
.L31:
        .loc 1 13 0
        movzbl  (%rcx), %eax    #* srcp1, a0
.LVL64:
        .loc 1 14 0
        movzbl  (%rdx), %edi    #* srcp2, b0
...

At je .L26 insn %r12 register contains correct value of b0 variable and %r10 of x (shifted + ored together value).  But then suddenly the code assumes it is in memory without actually storing it there.
Comment 4 Jakub Jelinek 2008-11-07 16:32:10 UTC
*.copyprop4 still contains the needed stores:
  [gg5.c : 151] D.2147_293 = a3D.2068_292 >> shlD.2074_195;
  [gg5.c : 151] D.2148_295 = a0D.2065_294 << shrD.2075_198;
  [gg5.c : 151] xD.2073_296 = D.2148_295 | D.2147_293;
  [gg5.c : 152] if (xD.2073_296 != b0D.2069_297)
    goto <bb 51>;
  else
    goto <bb 55> (do2);
  # SUCC: 51 [72.0%]  (true,exec) 55 [28.0%]  (irreducible,false,exec)

  # BLOCK 51 freq:370, starting at line 0
  # PRED: 50 [72.0%]  (true,exec)
  # aD.2099_621 = VDEF <aD.2099_620(D)> { aD.2099 }
  aD.2099 = xD.2073_296;
  # bD.2098_623 = VDEF <bD.2098_622(D)> { bD.2098 }
  bD.2098 = b0D.2069_297;
  [gg5.c : 8] srcp1D.2100_298 = (long intD.2) [gg5.c : 8] &aD.2099;
  [gg5.c : 9] srcp2D.2101_299 = (long intD.2) [gg5.c : 9] &bD.2098;

  # BLOCK 52 freq:2640
  # PRED: 51 [100.0%]  (fallthru,exec) 53 [100.0%]  (fallthru,exec)
  # srcp1D.2100_300 = PHI <srcp1D.2100_298(51), srcp1D.2100_308(53)>
  # srcp2D.2101_304 = PHI <srcp2D.2101_299(51), srcp2D.2101_309(53)>
  [gg5.c : 13] srcp1.0D.2149_301 = (unsigned charD.10 *) srcp1D.2100_300;
  [gg5.c : 13] # VUSE <aD.1965_413(D), bD.1970_414(D), aD.1971_415(D), bD.1976_416(D), aD.1977_417(D), bD.1982_418(D), aD.1983_41
9(D), SMT.25D.2200_420(D)> { aD.1965 bD.1970 aD.1971 bD.1976 aD.1977 bD.1982 aD.1983 SMT.25D.2200 }
  D.2150_302 = *srcp1.0D.2149_301;
  [gg5.c : 13] a0D.2102_303 = (long unsigned intD.4) D.2150_302;
  [gg5.c : 14] srcp2.1D.2151_305 = (unsigned charD.10 *) srcp2D.2101_304;
  [gg5.c : 14] # VUSE <aD.1965_413(D), bD.1970_414(D), aD.1971_415(D), bD.1976_416(D), aD.1977_417(D), bD.1982_418(D), aD.1983_41
9(D), SMT.25D.2200_420(D)> { aD.1965 bD.1970 aD.1971 bD.1976 aD.1977 bD.1982 aD.1983 SMT.25D.2200 }
  D.2152_306 = *srcp2.1D.2151_305;

(a = x and b = b0) before taking their addresses, but *.dceloop1 nukes them, supposedly because VUSE on the *srcp1 and *srcp2 reads is wrong.  This is with -O2 -fno-strict-aliasing, but even with strict aliasing taking an address of an unsigned long variable, casting it to unsigned char * and reading it a single byte at a time through unsigned char * pointer is valid.
Comment 5 Jakub Jelinek 2008-11-07 16:46:17 UTC
At *.crited there is still a virtual dependency between a = x; (and b = b0;) and
*srcp1 + *srcp2 reads:
  # MPT.26D.2201_438 = VDEF <MPT.26D.2201_421(D)> { MPT.26D.2201 }
  aD.2099 = xD.2073_296;
  # MPT.26D.2201_439 = VDEF <MPT.26D.2201_438> { MPT.26D.2201 }
  bD.2098 = b0D.2069_297;
...
  [gg5.c : 13] # VUSE <aD.1965_413(D), bD.1970_414(D), aD.1971_415(D), bD.1976_416(D), aD.1977_417(D), bD.1982_418(D), aD.1983_41
9(D), SMT.25D.2200_420(D), MPT.26D.2201_439> { aD.1965 bD.1970 aD.1971 bD.1976 aD.1977 bD.1982 aD.1983 SMT.25D.2200 MPT.26D.2201
}
  D.2150_302 = *srcp1.0D.2149_301;
...
  [gg5.c : 14] # VUSE <aD.1965_413(D), bD.1970_414(D), aD.1971_415(D), bD.1976_416(D), aD.1977_417(D), bD.1982_418(D), aD.1983_41
9(D), SMT.25D.2200_420(D), MPT.26D.2201_439> { aD.1965 bD.1970 aD.1971 bD.1976 aD.1977 bD.1982 aD.1983 SMT.25D.2200 MPT.26D.2201
}
  D.2152_306 = *srcp2.1D.2151_305;

through MPT.26.  But in *.pre that's gone and there is no virtual dependency between them anymore.
Comment 6 Richard Biener 2008-11-07 17:35:59 UTC
I will have a look.
Comment 7 Richard Biener 2008-11-07 18:43:57 UTC
During update_alias_info we collect the variables that are written to, but
because we use gimple_stored_syms to get at them we miss those that are in
memory partitions.  If the new partitioning moves them out of a partition
we then miss conflicts for them.  Oops.

I have a patch.
Comment 8 Richard Biener 2008-11-07 19:33:01 UTC
Created attachment 16633 [details]
prototype patch

Prototype patch.  It needs some work as appearantly we cannot use MPT_SYMBOLS
reliably there - I get random segfaults / referenced_var_lookup ICEs.

As we are only interested in symbols we can just look at the statement manually.
But at least for asms that will need some work.
Comment 9 Richard Biener 2008-11-15 15:39:20 UTC
Subject: Bug 38051

Author: rguenth
Date: Sat Nov 15 15:37:57 2008
New Revision: 141887

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=141887
Log:
2008-11-15  Richard Guenther  <rguenther@suse.de>

	PR tree-optimization/38051
	* tree-ssa-alias.c (update_alias_info_1): Manually find
	written variables.

	* gcc.c-torture/execute/pr38051.c: New testcase.

Added:
    trunk/gcc/testsuite/gcc.c-torture/execute/pr38051.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/tree-ssa-alias.c

Comment 10 Richard Biener 2008-11-15 15:42:18 UTC
Fixed.
Comment 11 Dominique d'Humieres 2008-11-18 13:38:37 UTC
The test fails on powerpc-apple-darwin9 (revision 141945, 32 and 64 bit modes):

FAIL: gcc.c-torture/execute/pr38051.c execution,  -O0 
FAIL: gcc.c-torture/execute/pr38051.c execution,  -O1 
FAIL: gcc.c-torture/execute/pr38051.c execution,  -O2 
FAIL: gcc.c-torture/execute/pr38051.c execution,  -O3 -fomit-frame-pointer 
FAIL: gcc.c-torture/execute/pr38051.c execution,  -O3 -fomit-frame-pointer -funroll-loops 
FAIL: gcc.c-torture/execute/pr38051.c execution,  -O3 -fomit-frame-pointer -funroll-all-loops -finline-functions 
FAIL: gcc.c-torture/execute/pr38051.c execution,  -O3 -g 
FAIL: gcc.c-torture/execute/pr38051.c execution,  -Os 

Comment 12 Steve Ellcey 2008-11-18 17:31:01 UTC
The new test is also failing on IA64 HP-UX and PA HP-UX (32 and 64 bits).  It is not failing on IA64 Linux.  Could the test have a big-endian/little-endian issue?
Comment 13 Jakub Jelinek 2008-11-18 18:19:31 UTC
See http://gcc.gnu.org/ml/gcc-patches/2008-11/msg00905.html
Comment 14 Jakub Jelinek 2008-11-18 23:03:07 UTC
Subject: Bug 38051

Author: jakub
Date: Tue Nov 18 23:01:35 2008
New Revision: 141983

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=141983
Log:
	PR tree-optimization/38051
	* gcc.c-torture/execute/pr38051.c (buf): Remove aligned attribute.
	(buf2): Removed.
	(main): Only run on little endian targets with
	sizeof (long) == sizeof (void *).  Use just one buffer, align the
	pointers at runtime.

Modified:
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/testsuite/gcc.c-torture/execute/pr38051.c

Comment 15 Richard Biener 2008-11-20 12:15:37 UTC
*** Bug 38169 has been marked as a duplicate of this bug. ***
Comment 16 Richard Biener 2008-12-10 18:43:21 UTC
*** Bug 38478 has been marked as a duplicate of this bug. ***