[Bug target/100348] New: RISC-V extra pointer adjustments for memcpy() from glibc
brian.grayson at sifive dot com
gcc-bugzilla@gcc.gnu.org
Thu Apr 29 23:40:25 GMT 2021
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100348
Bug ID: 100348
Summary: RISC-V extra pointer adjustments for memcpy() from
glibc
Product: gcc
Version: 10.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: brian.grayson at sifive dot com
Target Milestone: ---
Created attachment 50714
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=50714&action=edit
Source C code for trimmed example
Compiling the wordcopy_fwd_aligned() portion of glibc produces code like the
following assembly dump:
.L16:
ld a6,16(a1)
addi a3,a5,24
sd a4,0(a0)
.L15:
ld a4,24(a1)
addi a0,a5,32
sd a6,0(a3)
.L14:
ld a6,32(a1)
addi a3,a5,40
sd a4,0(a0)
.L13:
ld a4,40(a1)
addi a0,a5,48
sd a6,0(a3)
.L12:
ld a6,48(a1)
addi a3,a5,56
sd a4,0(a0)
For some reason, gcc does the proper immediate-offset analysis for the loads,
which use different offsets from a fixed base register (until we get to the
bottom of the loop and bump the a1 pointer), but for the stores, it bumps the
pointer before each one. This adds 50% more instructions to the innermost loop
of memcpy(), arguably a piece of code we should try hard to keep optimal.
I've simplified the code here, to make it about as small as it can be and still
demonstrate the issue.
Here is the generated assembly, with my commentary to the right in comments:
.file "simple.c"
.option nopic
.attribute arch, "rv64i2p0_m2p0_a2p0_f2p0_d2p0_c2p0"
.attribute unaligned_access, 0
.attribute stack_align, 16
.text
.align 1
.globl not_really_wordcopy_fwd_aligned
.type not_really_wordcopy_fwd_aligned, @function
not_really_wordcopy_fwd_aligned:
andi a5,a2,3
li a4,2
beq a5,a4,.L2
li a4,3
beq a5,a4,.L3
li a4,1
beq a5,a4,.L20
bne a2,zero,.L21
ret
.L20:
addi a2,a2,-1
ld a3,0(a1)
bne a2,zero,.L22
.L10:
sd a3,0(a0)
.L23:
ret
.L3:
ld a3,0(a1)
addi a4,a1,-40
addi a5,a0,-48
addi a2,a2,5
addi a1,a1,-8
addi a7,a0,-16
j .L7
.L11:
sd a3,32(a5) // Uses an immediate offset off of a5. Good.
ld a3,32(a4)
addi a7,a5,32
addi a1,a4,32
.L9:
sd a3,40(a5) // Uses an immediate offset off of a5. Good again.
ld a3,40(a4)
addi a0,a5,48 // Uh-oh, bumping a pointer to be a5+48.
.L7:
sd a3,0(a0) // Uh-oh, using the pointer. Why not 48(a5)?
ld a6,48(a4)
addi a0,a5,56
mv a3,a4
mv a5,a7
mv a4,a1
.L6:
sd a6,0(a0)
addi a2,a2,-4
ld a3,56(a3)
mv a0,a5
bne a2,zero,.L11
sd a3,0(a0)
j .L23
.L2:
ld a6,0(a1)
addi a3,a1,-48
addi a2,a2,6
addi a4,a1,-16
addi a5,a0,-24
j .L6
.L22:
addi a4,a1,8
mv a5,a0
j .L11
.L21:
ld a3,0(a1)
mv a4,a1
addi a5,a0,-8
addi a1,a1,32
addi a7,a0,24
j .L9
.size not_really_wordcopy_fwd_aligned, .-not_really_wordcopy_fwd_aligned
.ident "GCC: (SiFive GCC-Metal 10.2.0-2020.12.9) 10.2.0"
I realize that the different paths to .L7 make this a bit confusing, but
since the only ways to reach .L7 are through the code at .L3 or through the
fall-through path that currently adjusts the pointer, the proper adjustments
could be made once, outside the loop, instead of on every iteration.
In the full memcpy() source code, this behavior occurs multiple times, roughly
for every store, as shown at the top of this bug report.