Totally bogus string optimizations

Joseph S. Myers jsm28@cam.ac.uk
Sat Oct 28 12:20:00 GMT 2000


[Moved onto gcc-bugs as well as libc-alpha - there is a regression here in
how GCC handles glibc's string macros, as well as scope for GCC to handle
more of the optimisations itself.]

On 28 Oct 2000, Ulrich Drepper wrote:

> And no, the compiler is not able to optimize as good.  If the compiler
> would work correctly an strcpy() like your example would be
> transformed into something like this
> 
> 
>     ...get dest address in %eax...
>    movl $0x626f6f46, (%eax)
>    movw $0x7261, 4(%eax)
>    movb $0, 6(%eax)
> 
> 
> which is much better since we have no loading, no relocation (this is
> a data relocation which always has to be performed), and not another
> string constant.

emit_block_move in the compiler needs to be improved so that it will take
data from (at least) non-wide string constants at compile time (and
ideally from all string constants and constant arrays).  As it stands, CVS
GCC produces the following from its builtin strcpy:

.LC0:
        .string "Foobar"
        .text
        .align 16
.globl foo
        .type   foo,@function
foo:
        movl    p, %edx
        movl    .LC0, %eax
        movl    %eax, (%edx)
        movzwl  .LC0+4, %eax
        movw    %ax, 4(%edx)
        movzbl  .LC0+6, %eax
        movb    %al, 6(%edx)
        ret

If this gets improved in GCC, then the strcpy macro would best be
disabled for GCC 2.97 and later, because the code generated by the macro
in GCC CVS is terrible:

.LC0:
        .string "Foobar"
        .text
        .align 16
.globl foo
        .type   foo,@function
foo:
        movl    $.LC0+1, %eax
        subl    $12, %esp
        cmpl    %eax, %eax
        je      .L158
        subl    $4, %esp
        movl    p, %eax
        pushl   $7
        pushl   $.LC0
        pushl   %eax
        call    memcpy
        addl    $16, %esp
.L143:
        addl    $12, %esp
        ret
        .p2align 4,,7
.L158:
        movl    p, %eax
        movl    $1651470150, (%eax)
        movw    $29281, 4(%eax)
        movb    $0, 6(%eax)
        jmp     .L143

whereas GCC 2.95.2, given the same macro, generates the code we want,
apart from leaving the string constant in the code (so there *is* a
regression in GCC):

.LC0:
        .string "Foobar"
.text
        .align 4
.globl foo
        .type    foo,@function
foo:
        movl p,%eax
        movl $1651470150,(%eax)
        movw $29281,4(%eax)
        movb $0,6(%eax)
        ret

(All examples on x86 with -O2 -fomit-frame-pointer.)

Every nested use of the strcpy macro increases the preprocessed size
20-fold, so an innocent line

strcpy(D1, strcpy(D2, strcpy(D3, strcpy(D4, S))))

expands to 10 Mbyte of source when compiled against glibc.  This is a
strong reason to disable the glibc optimisations when using a compiler
version known to do them itself.

Here is the preprocessed code which GCC CVS handles much worse than GCC
2.95.2:

/* Declarations the test case below relies on, as left by preprocessing.
   size_t is spelled out as unsigned int here.  */
typedef unsigned int size_t;

extern void *memcpy (void *__restrict __dest,
                     __const void *__restrict __src, size_t __n);

extern char *strcpy (char *__restrict __dest, __const char *__restrict __src);

extern size_t strlen (__const char *__s) __attribute__ ((__pure__));

/* Integer types used to carry the packed string bytes into
   __strcpy_small.  */
typedef unsigned short int __uint16_t;
typedef unsigned int __uint32_t;

/* Store a short string at __dest using a few fixed-width writes instead
   of a byte loop.

   __srclen is the number of bytes to write, counting the terminating
   NUL (the caller in foo passes strlen (...) + 1).  The string data
   arrives already packed into integers by the caller:
     __src0_2 - bytes 0..1      __src0_4 - bytes 0..3
     __src4_2 - bytes 4..5      __src4_4 - bytes 4..7
   with the lowest-addressed byte in the least significant bits, as
   built by the shift-and-or expressions in foo.

   Only lengths 1 through 8 are handled; any other __srclen writes
   nothing.  Returns __dest.  */
extern __inline char *__strcpy_small (char *, __uint16_t, __uint16_t,
                                      __uint32_t, __uint32_t, size_t);
extern __inline char *
__strcpy_small (char *__dest,
                __uint16_t __src0_2, __uint16_t __src4_2,
                __uint32_t __src0_4, __uint32_t __src4_4,
                size_t __srclen)
{
  /* View the destination through a union so that each store can choose
     its width (__ui, __usi or __uc); __u is advanced by byte offsets
     between stores.  */
  union {
    __uint32_t __ui;
    __uint16_t __usi;
    unsigned char __uc;
  } *__u = (void *) __dest;
  switch ((unsigned int) __srclen)
    {
    case 1:
      /* Empty string: only the terminator.  */
      __u->__uc = '\0';
      break;
    case 2:
      /* Bytes 0..1 in one 16-bit store.  */
      __u->__usi = __src0_2;
      break;
    case 3:
      /* Bytes 0..1, then an explicit terminator at offset 2.  */
      __u->__usi = __src0_2;
      __u = (void *) ((char *) __u + 2);
      __u->__uc = '\0';
      break;
    case 4:
      /* Bytes 0..3 in one 32-bit store.  */
      __u->__ui = __src0_4;
      break;
    case 5:
      /* Bytes 0..3, then an explicit terminator at offset 4.  */
      __u->__ui = __src0_4;
      __u = (void *) ((char *) __u + 4);
      __u->__uc = '\0';
      break;
    case 6:
      /* Bytes 0..3, then bytes 4..5.  */
      __u->__ui = __src0_4;
      __u = (void *) ((char *) __u + 4);
      __u->__usi = __src4_2;
      break;
    case 7:
      /* Bytes 0..3, bytes 4..5, then an explicit terminator at offset 6.
         This is the length foo passes for "Foobar".  */
      __u->__ui = __src0_4;
      __u = (void *) ((char *) __u + 4);
      __u->__usi = __src4_2;
      __u = (void *) ((char *) __u + 2);
      __u->__uc = '\0';
      break;
    case 8:
      /* Bytes 0..3, then bytes 4..7.  */
      __u->__ui = __src0_4;
      __u = (void *) ((char *) __u + 4);
      __u->__ui = __src4_4;
      break;
    }
  return __dest;
}

char *p;

/* Test case: the body is the preprocessed expansion of glibc's strcpy
   macro for copying "Foobar" to p.

   The expression first asks __builtin_constant_p whether the source is
   a compile-time constant.  If it is, and the source has one-byte
   elements and its length plus terminator is at most 8, it calls
   __strcpy_small with the string's bytes packed into 16- and 32-bit
   integers (lowest-addressed byte in the least significant bits).
   Longer constant strings go to memcpy, and non-constant sources go to
   the real strcpy.  */
void
foo (void)
{
  (__extension__ (__builtin_constant_p ("Foobar") ?  (((size_t)(const
  void *)(("Foobar") + 1) - (size_t)(const void *)("Foobar") == 1) &&
  strlen ("Foobar") + 1 <= 8 ? __strcpy_small (p, __extension__
  (((__const unsigned char *) (__const char *) ("Foobar"))[0 + 1] << 8
  | ((__const unsigned char *) (__const char *) ("Foobar"))[0]),
  __extension__ (((__const unsigned char *) (__const char *)
  ("Foobar"))[4 + 1] << 8 | ((__const unsigned char *) (__const char
  *) ("Foobar"))[4]), __extension__ (((((__const unsigned char *)
  (__const char *) ("Foobar"))[0 + 3] << 8 | ((__const unsigned char
  *) (__const char *) ("Foobar"))[0 + 2]) << 8 | ((__const unsigned
  char *) (__const char *) ("Foobar"))[0 + 1]) << 8 | ((__const
  unsigned char *) (__const char *) ("Foobar"))[0]), __extension__
  (((((__const unsigned char *) (__const char *) ("Foobar"))[4 + 3] <<
  8 | ((__const unsigned char *) (__const char *) ("Foobar"))[4 + 2])
  << 8 | ((__const unsigned char *) (__const char *)
  ("Foobar"))[4]), strlen ("Foobar") + 1) : (char *) memcpy (p,
  "Foobar", strlen ("Foobar") + 1)) : strcpy (p, "Foobar")));
}

-- 
Joseph S. Myers
jsm28@cam.ac.uk



More information about the Gcc-bugs mailing list