[Bug c++/85721] New: bad codegen for looped copy of primitives at -O2 and -O3 (differently bad)
redbeard0531 at gmail dot com
gcc-bugzilla@gcc.gnu.org
Wed May 9 18:56:00 GMT 2018
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85721
Bug ID: 85721
Summary: bad codegen for looped copy of primitives at -O2 and
-O3 (differently bad)
Product: gcc
Version: 8.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: redbeard0531 at gmail dot com
Target Milestone: ---
https://godbolt.org/g/Gg9fFt
Related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85720, but filed
separately because this one also affects -O3. As in that report, the problem
affects types other than char, but char is the most egregious case.
using SIZE_T = decltype(sizeof(0));
void copy(char* out, const char* in, SIZE_T n) {
for (SIZE_T i = 0; i < n; i++){
out[i] = in[i];
}
}
This should probably just be compiled to a size check followed by a jump to
memmove. At -O2 it copies byte-by-byte:
copy(char*, char const*, unsigned long):
test rdx, rdx
je .L1
xor eax, eax
.L3:
movzx ecx, BYTE PTR [rsi+rax]
mov BYTE PTR [rdi+rax], cl
add rax, 1
cmp rdx, rax
jne .L3
.L1:
ret
At -O3 it generates a TON of code:
copy(char*, char const*, unsigned long):
test rdx, rdx
je .L1
lea rax, [rsi+16]
cmp rdi, rax
lea rax, [rdi+16]
setnb cl
cmp rsi, rax
setnb al
or cl, al
je .L7
lea rax, [rdx-1]
cmp rax, 14
jbe .L7
mov rcx, rdx
xor eax, eax
and rcx, -16
.L4:
movdqu xmm0, XMMWORD PTR [rsi+rax]
movups XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rcx
jne .L4
mov rax, rdx
and rax, -16
cmp rdx, rax
je .L1
movzx ecx, BYTE PTR [rsi+rax]
mov BYTE PTR [rdi+rax], cl
lea rcx, [rax+1]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+1+rax]
mov BYTE PTR [rdi+1+rax], cl
lea rcx, [rax+2]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+2+rax]
mov BYTE PTR [rdi+2+rax], cl
lea rcx, [rax+3]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+3+rax]
mov BYTE PTR [rdi+3+rax], cl
lea rcx, [rax+4]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+4+rax]
mov BYTE PTR [rdi+4+rax], cl
lea rcx, [rax+5]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+5+rax]
mov BYTE PTR [rdi+5+rax], cl
lea rcx, [rax+6]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+6+rax]
mov BYTE PTR [rdi+6+rax], cl
lea rcx, [rax+7]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+7+rax]
mov BYTE PTR [rdi+7+rax], cl
lea rcx, [rax+8]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+8+rax]
mov BYTE PTR [rdi+8+rax], cl
lea rcx, [rax+9]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+9+rax]
mov BYTE PTR [rdi+9+rax], cl
lea rcx, [rax+10]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+10+rax]
mov BYTE PTR [rdi+10+rax], cl
lea rcx, [rax+11]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+11+rax]
mov BYTE PTR [rdi+11+rax], cl
lea rcx, [rax+12]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+12+rax]
mov BYTE PTR [rdi+12+rax], cl
lea rcx, [rax+13]
cmp rdx, rcx
jbe .L1
movzx ecx, BYTE PTR [rsi+13+rax]
mov BYTE PTR [rdi+13+rax], cl
lea rcx, [rax+14]
cmp rdx, rcx
jbe .L1
movzx edx, BYTE PTR [rsi+14+rax]
mov BYTE PTR [rdi+14+rax], dl
ret
.L7:
xor eax, eax
.L3:
movzx ecx, BYTE PTR [rsi+rax]
mov BYTE PTR [rdi+rax], cl
add rax, 1
cmp rdx, rax
jne .L3
.L1:
ret
A) This should probably just call memmove, which has a tuned implementation for
many architectures and uses ifunc dispatch to choose the right one based on the
runtime CPU rather than on compile-time settings. Also, since every such copy
loop, for every element type, would then jump to one shared function, there
should be instruction-cache (I$) advantages as well.
B) If you really want to emit inline code for this rather than calling into
libc, it is probably best to use glibc's technique of overlapping the reads and
writes for the last vector, rather than falling back to an unrolled
byte-by-byte tail loop:
https://github.molgen.mpg.de/git-mirror/glibc/blob/20003c49884422da7ffbc459cdeee768a6fee07b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S#L331-L335
More information about the Gcc-bugs
mailing list