[Bug c/100320] New: regression: 32-bit x86 memcpy is suboptimal

vda.linux at googlemail dot com gcc-bugzilla@gcc.gnu.org
Wed Apr 28 15:09:25 GMT 2021


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100320

            Bug ID: 100320
           Summary: regression: 32-bit x86 memcpy is suboptimal
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vda.linux at googlemail dot com
  Target Milestone: ---

Bug 21329 has returned.

32-bit x86 memory block moves are using "movl $LEN,%ecx; rep movsl" insns.

However, for fixed short blocks it is more efficient to just repeat a few
"movsl" insns - this allows dropping the "mov $LEN,%ecx" insn.

It's shorter, and more importantly, "rep movsl" are slow-start microcoded insns
(they are faster than moves using general-purpose registers only on blocks
larger than 100-200 bytes) - OTOH, bare "movsl" are not microcoded and take ~4
cycles to execute.

Bug 21329 was closed as fixed by the following commit:

CVSROOT:        /cvs/gcc
Module name:    gcc
Branch:         gcc-4_0-rhl-branch
Changes by:     jakub@gcc.gnu.org       2005-05-18 19:08:44
Modified files:
        gcc            : ChangeLog 
        gcc/config/i386: i386.c 
Log message:
        2005-05-06  Denis Vlasenko  <vda@port.imtp.ilyichevsk.odessa.ua>
        Jakub Jelinek  <jakub@redhat.com>       
        PR target/21329
        * config/i386/i386.c (ix86_expand_movmem): Don't use rep; movsb
        for -Os if (movsl;)*(movsw;)?(movsb;)? sequence is shorter.
        Don't use rep; movs{l,q} if the repetition count is really small,
        instead use a sequence of movs{l,q} instructions.

(the above is commit 95935e2db5c45bef5631f51538d1e10d8b5b7524 in
gcc.gnu.org/git/gcc.git,
seems that code was largely replaced by:
commit 8c996513856f2769aee1730cb211050fef055fb5
Author: Jan Hubicka <jh@suse.cz>
Date:   Mon Nov 27 17:00:26 2006 +0100
    expr.c (emit_block_move_via_libcall): Export.
)


With gcc version 11.0.0 20210210 (Red Hat 11.0.0-0) (GCC) I see "rep movsl"s
again:

void *f(void *d, const void *s)
{ return memcpy(d, s, 16); }

$ gcc -Os -m32 -fomit-frame-pointer -c -o z.o z.c && objdump -drw z.o
z.o:     file format elf32-i386
Disassembly of section .text:
00000000 <f>:
   0:   57                      push   %edi
   1:   b9 04 00 00 00          mov    $0x4,%ecx
   6:   56                      push   %esi
   7:   8b 44 24 0c             mov    0xc(%esp),%eax
   b:   8b 74 24 10             mov    0x10(%esp),%esi
   f:   89 c7                   mov    %eax,%edi
  11:   f3 a5                   rep movsl %ds:(%esi),%es:(%edi)
  13:   5e                      pop    %esi
  14:   5f                      pop    %edi
  15:   c3                      ret 

The expected code would not have "mov $0x4,%ecx" and would have "rep movsl"
replaced by "movsl;movsl;movsl;movsl".

The testcase from 21329 with implicit block moves via struct copies, from here
        https://gcc.gnu.org/bugzilla/attachment.cgi?id=8790
also demonstrates it:

$ gcc -Os -m32 -fomit-frame-pointer -c -o z1.o z1.c && objdump -drw z1.o
z1.o:     file format elf32-i386
Disassembly of section .text:
00000000 <f10>:
   0:   a1 00 00 00 00          mov    0x0,%eax 1: R_386_32     w10
   5:   a3 00 00 00 00          mov    %eax,0x0 6: R_386_32     t10
   a:   c3                      ret    
0000000b <f20>:
   b:   a1 00 00 00 00          mov    0x0,%eax c: R_386_32     w20
  10:   8b 15 04 00 00 00       mov    0x4,%edx 12: R_386_32    w20
  16:   a3 00 00 00 00          mov    %eax,0x0 17: R_386_32    t20
  1b:   89 15 04 00 00 00       mov    %edx,0x4 1d: R_386_32    t20
  21:   c3                      ret    
00000022 <f21>:
  22:   57                      push   %edi
  23:   b9 09 00 00 00          mov    $0x9,%ecx
  28:   bf 00 00 00 00          mov    $0x0,%edi        29: R_386_32    t21
  2d:   56                      push   %esi
  2e:   be 00 00 00 00          mov    $0x0,%esi        2f: R_386_32    w21
  33:   f3 a4                   rep movsb %ds:(%esi),%es:(%edi)
  35:   5e                      pop    %esi
  36:   5f                      pop    %edi
  37:   c3                      ret    
00000038 <f22>:
  38:   57                      push   %edi
  39:   b9 0a 00 00 00          mov    $0xa,%ecx
  3e:   bf 00 00 00 00          mov    $0x0,%edi        3f: R_386_32    t22
  43:   56                      push   %esi
  44:   be 00 00 00 00          mov    $0x0,%esi        45: R_386_32    w22
  49:   f3 a4                   rep movsb %ds:(%esi),%es:(%edi)
  4b:   5e                      pop    %esi
  4c:   5f                      pop    %edi
  4d:   c3                      ret    
0000004e <f23>:
  4e:   57                      push   %edi
  4f:   b9 0b 00 00 00          mov    $0xb,%ecx
  54:   bf 00 00 00 00          mov    $0x0,%edi        55: R_386_32    t23
  59:   56                      push   %esi
  5a:   be 00 00 00 00          mov    $0x0,%esi        5b: R_386_32    w23
  5f:   f3 a4                   rep movsb %ds:(%esi),%es:(%edi)
  61:   5e                      pop    %esi
  62:   5f                      pop    %edi
  63:   c3                      ret    
00000064 <f30>:
  64:   57                      push   %edi
  65:   b9 03 00 00 00          mov    $0x3,%ecx
  6a:   bf 00 00 00 00          mov    $0x0,%edi        6b: R_386_32    t30
  6f:   56                      push   %esi
  70:   be 00 00 00 00          mov    $0x0,%esi        71: R_386_32    w30
  75:   f3 a5                   rep movsl %ds:(%esi),%es:(%edi)
  77:   5e                      pop    %esi
  78:   5f                      pop    %edi
  79:   c3                      ret    
0000007a <f40>:
  7a:   57                      push   %edi
  7b:   b9 04 00 00 00          mov    $0x4,%ecx
  80:   bf 00 00 00 00          mov    $0x0,%edi        81: R_386_32    t40
  85:   56                      push   %esi
  86:   be 00 00 00 00          mov    $0x0,%esi        87: R_386_32    w40
  8b:   f3 a5                   rep movsl %ds:(%esi),%es:(%edi)
  8d:   5e                      pop    %esi
  8e:   5f                      pop    %edi
  8f:   c3                      ret    
00000090 <f50>:
  90:   57                      push   %edi
  91:   b9 05 00 00 00          mov    $0x5,%ecx
  96:   bf 00 00 00 00          mov    $0x0,%edi        97: R_386_32    t50
  9b:   56                      push   %esi
  9c:   be 00 00 00 00          mov    $0x0,%esi        9d: R_386_32    w50
  a1:   f3 a5                   rep movsl %ds:(%esi),%es:(%edi)
  a3:   5e                      pop    %esi
  a4:   5f                      pop    %edi
  a5:   c3                      ret    
000000a6 <f60>:
  a6:   57                      push   %edi
  a7:   b9 06 00 00 00          mov    $0x6,%ecx
  ac:   bf 00 00 00 00          mov    $0x0,%edi        ad: R_386_32    t60
  b1:   56                      push   %esi
  b2:   be 00 00 00 00          mov    $0x0,%esi        b3: R_386_32    w60
  b7:   f3 a5                   rep movsl %ds:(%esi),%es:(%edi)
  b9:   5e                      pop    %esi
  ba:   5f                      pop    %edi
  bb:   c3                      ret    
000000bc <f>:
...


More information about the Gcc-bugs mailing list