[Bug c++/85721] New: bad codegen for looped copy of primitives at -O2 and -O3 (differently bad)

redbeard0531 at gmail dot com gcc-bugzilla@gcc.gnu.org
Wed May 9 18:56:00 GMT 2018


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85721

            Bug ID: 85721
           Summary: bad codegen for looped copy of primitives at -O2 and
                    -O3 (differently bad)
           Product: gcc
           Version: 8.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: redbeard0531 at gmail dot com
  Target Milestone: ---

https://godbolt.org/g/Gg9fFt

Related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85720, but filed
separately because this one also affects -O3. As in that bug, types other than
char are affected too, but char is the most egregious.

using SIZE_T = decltype(sizeof(0));
void copy(char* out, const char* in, SIZE_T n) {
    for (SIZE_T i = 0; i < n; i++){
        out[i] = in[i];
    }
}

This should probably just be compiled to a size check followed by a jmp to
memmove. At -O2 it copies byte-by-byte:

copy(char*, char const*, unsigned long):
        test    rdx, rdx
        je      .L1
        xor     eax, eax
.L3:
        movzx   ecx, BYTE PTR [rsi+rax]
        mov     BYTE PTR [rdi+rax], cl
        add     rax, 1
        cmp     rdx, rax
        jne     .L3
.L1:
        ret


At -O3 it generates a TON of code:

copy(char*, char const*, unsigned long):
  test rdx, rdx
  je .L1
  lea rax, [rsi+16]
  cmp rdi, rax
  lea rax, [rdi+16]
  setnb cl
  cmp rsi, rax
  setnb al
  or cl, al
  je .L7
  lea rax, [rdx-1]
  cmp rax, 14
  jbe .L7
  mov rcx, rdx
  xor eax, eax
  and rcx, -16
.L4:
  movdqu xmm0, XMMWORD PTR [rsi+rax]
  movups XMMWORD PTR [rdi+rax], xmm0
  add rax, 16
  cmp rax, rcx
  jne .L4
  mov rax, rdx
  and rax, -16
  cmp rdx, rax
  je .L1
  movzx ecx, BYTE PTR [rsi+rax]
  mov BYTE PTR [rdi+rax], cl
  lea rcx, [rax+1]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+1+rax]
  mov BYTE PTR [rdi+1+rax], cl
  lea rcx, [rax+2]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+2+rax]
  mov BYTE PTR [rdi+2+rax], cl
  lea rcx, [rax+3]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+3+rax]
  mov BYTE PTR [rdi+3+rax], cl
  lea rcx, [rax+4]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+4+rax]
  mov BYTE PTR [rdi+4+rax], cl
  lea rcx, [rax+5]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+5+rax]
  mov BYTE PTR [rdi+5+rax], cl
  lea rcx, [rax+6]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+6+rax]
  mov BYTE PTR [rdi+6+rax], cl
  lea rcx, [rax+7]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+7+rax]
  mov BYTE PTR [rdi+7+rax], cl
  lea rcx, [rax+8]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+8+rax]
  mov BYTE PTR [rdi+8+rax], cl
  lea rcx, [rax+9]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+9+rax]
  mov BYTE PTR [rdi+9+rax], cl
  lea rcx, [rax+10]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+10+rax]
  mov BYTE PTR [rdi+10+rax], cl
  lea rcx, [rax+11]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+11+rax]
  mov BYTE PTR [rdi+11+rax], cl
  lea rcx, [rax+12]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+12+rax]
  mov BYTE PTR [rdi+12+rax], cl
  lea rcx, [rax+13]
  cmp rdx, rcx
  jbe .L1
  movzx ecx, BYTE PTR [rsi+13+rax]
  mov BYTE PTR [rdi+13+rax], cl
  lea rcx, [rax+14]
  cmp rdx, rcx
  jbe .L1
  movzx edx, BYTE PTR [rsi+14+rax]
  mov BYTE PTR [rdi+14+rax], dl
  ret
.L7:
  xor eax, eax
.L3:
  movzx ecx, BYTE PTR [rsi+rax]
  mov BYTE PTR [rdi+rax], cl
  add rax, 1
  cmp rdx, rax
  jne .L3
.L1:
  ret

A) This should probably just call memmove, which has a tuned implementation for
many architectures and uses ifunc dispatch to choose the right variant based on
the runtime CPU rather than on compile-time settings. Additionally, since every
loop like this, for every element type, would jump to the same function, there
should be I$ advantages as well.
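
A minimal sketch of that lowering (hypothetical C++, not current GCC output;
the name copy_via_memmove is made up for illustration):

#include <cstring>

using SIZE_T = decltype(sizeof(0));

// Check the size, then let libc's ifunc-dispatched memmove do the copy.
// The n != 0 check preserves the original loop's behaviour for empty
// (possibly null-pointer) ranges, and memmove is the conservative choice
// since out and in are not declared non-overlapping.
void copy_via_memmove(char* out, const char* in, SIZE_T n) {
    if (n != 0)
        std::memmove(out, in, n);
}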

B) If you really want to emit code for this rather than calling into libc, it
is probably best to use glibc's technique of overlapping reads and writes for
the last vector rather than falling into an unrolled byte-by-byte remainder
loop:
https://github.molgen.mpg.de/git-mirror/glibc/blob/20003c49884422da7ffbc459cdeee768a6fee07b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S#L331-L335
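
A rough sketch of that approach (hypothetical, assuming non-overlapping
buffers and a 16-byte vector width; the fixed-size memcpy calls stand in for
single unaligned vector loads/stores):

#include <cstring>

using SIZE_T = decltype(sizeof(0));

// Copy full 16-byte blocks, then handle the remainder by copying the *last*
// 16 bytes of the range. That final copy may overlap bytes already written,
// which is harmless for non-overlapping buffers, and it avoids the unrolled
// byte-by-byte tail loop entirely.
void copy_overlapped_tail(char* out, const char* in, SIZE_T n) {
    if (n < 16) {                        // small copies handled separately
        for (SIZE_T i = 0; i < n; i++)
            out[i] = in[i];
        return;
    }
    SIZE_T i = 0;
    for (; i + 16 <= n; i += 16)
        std::memcpy(out + i, in + i, 16);            // one movdqu/movups pair
    if (i < n)
        std::memcpy(out + n - 16, in + n - 16, 16);  // overlapping last vector
}

With this, the remainder costs one extra vector copy instead of up to 15
compare-and-branch byte copies.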

