[Bug target/96234] New: Sub-optimal register allocation with a signed integer literal.

Fri Jul 17 15:01:38 GMT 2020

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96234

            Bug ID: 96234
           Summary: Sub-optimal register allocation with a signed integer
                    literal.
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: maxim.yegorushkin at gmail dot com
  Target Milestone: ---

The following code:

    #include <time.h>
    #include <stdint.h>

    // Variant A of the reproducer: the seconds-to-nanoseconds factor is written
    // as the signed literal `1000000000L`. Apart from the namespace name, this
    // literal suffix is the only thing that differs from the `UL` variant.
    namespace A {

    // Returns ts->tv_sec * 1000000000L + ts->tv_nsec, converted to uint64_t by
    // the return type.
    // NOTE(review): presumably tv_sec and tv_nsec are signed 64-bit integers on
    // the reported target, so the multiply/add would be carried out in signed
    // arithmetic before the conversion -- confirm against the target's <time.h>.
    inline uint64_t as_nanoseconds(struct timespec* ts) {
        return ts->tv_sec * 1000000000L + ts->tv_nsec;
    }

    // Returns c plus the difference between the nanosecond values of *a and *b.
    // Both operands of the subtraction are uint64_t, so the subtraction and the
    // addition are performed in unsigned (modulo 2^64) arithmetic.
    uint64_t f(uint64_t c, struct timespec* a, struct timespec* b) {
        return c + (as_nanoseconds(a) - as_nanoseconds(b));
    }

    }  // namespace A

    // Variant B of the reproducer: the seconds-to-nanoseconds factor is written
    // as the unsigned literal `1000000000UL`. Apart from the namespace name,
    // this literal suffix is the only thing that differs from the `L` variant.
    namespace B {

    // Returns ts->tv_sec * 1000000000UL + ts->tv_nsec as a uint64_t.
    // NOTE(review): presumably tv_sec and tv_nsec are signed 64-bit integers on
    // the reported target, in which case the unsigned literal would make the
    // whole expression unsigned via the usual arithmetic conversions -- confirm
    // against the target's <time.h>.
    inline uint64_t as_nanoseconds(struct timespec* ts) {
        return ts->tv_sec * 1000000000UL + ts->tv_nsec;
    }

    // Returns c plus the difference between the nanosecond values of *a and *b.
    // Both operands of the subtraction are uint64_t, so the subtraction and the
    // addition are performed in unsigned (modulo 2^64) arithmetic.
    uint64_t f(uint64_t c, struct timespec* a, struct timespec* b) {
        return c + (as_nanoseconds(a) - as_nanoseconds(b));
    }

    }  // namespace B

When compiled with `gcc-10.1 -O3 -march=skylake`, this produces a superfluous
instruction in the version with a signed constant `1000000000L`:

    A::f(unsigned long, timespec*, timespec*):
            mov     r8, rdx <----------------------------- superfluous instruction
            imul    rax, QWORD PTR [rsi], 1000000000
            imul    rdx, QWORD PTR [rdx], 1000000000
            add     rax, QWORD PTR [rsi+8]
            add     rdx, QWORD PTR [r8+8]
            sub     rax, rdx
            add     rax, rdi
            ret
    B::f(unsigned long, timespec*, timespec*):
            imul    rax, QWORD PTR [rsi], 1000000000
            add     rdi, QWORD PTR [rsi+8]
            sub     rdi, QWORD PTR [rdx+8]
            imul    rdx, QWORD PTR [rdx], 1000000000
            add     rax, rdi
            sub     rax, rdx
            ret

`clang` produces the same code for both versions and also optimizes away one
multiplication:

    A::f(unsigned long, timespec*, timespec*):                 # @A::f(unsigned long, timespec*, timespec*)
            mov     rax, qword ptr [rsi]
            sub     rax, qword ptr [rdx]
            imul    rax, rax, 1000000000
            add     rdi, qword ptr [rsi + 8]
            sub     rdi, qword ptr [rdx + 8]
            add     rax, rdi
            ret
    B::f(unsigned long, timespec*, timespec*):                 # @B::f(unsigned long, timespec*, timespec*)
            mov     rax, qword ptr [rsi]
            sub     rax, qword ptr [rdx]
            imul    rax, rax, 1000000000
            add     rdi, qword ptr [rsi + 8]
            sub     rdi, qword ptr [rdx + 8]
            add     rax, rdi
            ret

https://gcc.godbolt.org/z/Kf4q7z