GCC Bugzilla will be upgraded from version 4.4.9 to 5.0rc3 on Saturday, April 25, starting around 17:00 UTC. The upgrade process should only last a few minutes. Check bug 64968 for details.
Bug 20283 - optimising muldiv() type operations
optimising muldiv() type operations
Status: NEW
Product: gcc
Classification: Unclassified
Component: middle-end
4.2.1
: P3 enhancement
: ---
Assigned To: Not yet assigned to anyone
:
Depends on:
Blocks:
  Show dependency treegraph
 
Reported: 2005-03-02 14:25 UTC by Andrew Robb
Modified: 2008-09-01 05:50 UTC (History)
2 users (show)

See Also:
Host: SuSE Linux 9.2
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2006-01-05 14:23:04


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Andrew Robb 2005-03-02 14:25:46 UTC
Reading specs from /usr/lib/gcc-lib/i586-suse-linux/3.3.4/specs
Configured with: ../configure --enable-threads=posix --prefix=/usr
--with-local-prefix=/usr/local --infodir=/usr/share/info --mandir=/usr/share/man
--enable-languages=c,c++,f77,objc,java,ada --disable-checking --libdir=/usr/lib
--enable-libgcj --with-gxx-include-dir=/usr/include/g++ --with-slibdir=/lib
--with-system-zlib --enable-shared --enable-__cxa_atexit i586-suse-linux
Thread model: posix
gcc version 3.3.4 (pre 3.3.5 20040809)
 /usr/lib/gcc-lib/i586-suse-linux/3.3.4/cc1 -E -quiet -v -D__GNUC__=3
-D__GNUC_MINOR__=3 -D__GNUC_PATCHLEVEL__=4 muldiv.c -march=pentium -O3 muldiv.i
#include "..." search starts here:
#include <...> search starts here:
 /usr/local/include
 /usr/lib/gcc-lib/i586-suse-linux/3.3.4/include
 /usr/i586-suse-linux/include
 /usr/include
End of search list.
 /usr/lib/gcc-lib/i586-suse-linux/3.3.4/cc1 -fpreprocessed muldiv.i -quiet
-dumpbase muldiv.c -march=pentium -auxbase muldiv -O3 -version -o muldiv.s
GNU C version 3.3.4 (pre 3.3.5 20040809) (i586-suse-linux)
        compiled by GNU C version 3.3.4 (pre 3.3.5 20040809).
GGC heuristics: --param ggc-min-expand=63 --param ggc-min-heapsize=63486

When compiling the following function:

gcc -O3 -S -march=pentium

int vat(int a)
{
  return a * 47 / 40;
}

optimising misses the trick of combining the multiply and divide into a single
multiply and shift arithmetic right.

# assume value is already in EAX
movl $1261646643,%ecx
imul %ecx
sarl $30 # I mean shift %edx,%eax pair right by 30 bits

I realise I haven't chosen the value properly to give the same results as an
overflow when multiplying by 47, but is that defined in the standard?
Comment 1 Andrew Pinski 2005-03-02 16:35:34 UTC
With unsigned instead of signed, I get the following code with the mainline:
        movl    4(%esp), %eax
        leal    (%eax,%eax,2), %edx
        sall    $4, %edx
        subl    %eax, %edx
        movl    $-858993459, %eax
        mull    %edx
        shrl    $5, %edx
        movl    %edx, %eax
        ret

Is this okay? If so, then there is no bug here really.
Comment 2 Andrew Robb 2005-03-02 16:53:02 UTC
Subject: Re:  optimising muldiv() type operations

Hi,
Thanks for getting back. Your code below still performs a separate 
multiply by 47 before 'dividing' by 40.  My enhancement request is to 
combine these two operations making the 2nd-4th (and 8th) lines below 
redundant.

Well done though for spotting it should have been unsigned (signed 
division is horrible as it is not defined/portable).

pinskia at gcc dot gnu dot org wrote:

>------- Additional Comments From pinskia at gcc dot gnu dot org  2005-03-02 16:35 -------
>With unsigned instead of signed, I get the following code with the mainline:
>        movl    4(%esp), %eax
>        leal    (%eax,%eax,2), %edx
>        sall    $4, %edx
>        subl    %eax, %edx
>        movl    $-858993459, %eax
>        mull    %edx
>        shrl    $5, %edx
>        movl    %edx, %eax
>        ret
>
>Is this okay? If so, then there is no bug here really.
>
>  
>

Comment 3 Andrew Pinski 2005-06-04 16:44:06 UTC
Confirmed.
Comment 4 Andrew Robb 2008-08-31 07:48:39 UTC
I should point out that for operations (x * y / z) where y > z, there is nothing to be gained on an Intel Core 2 duo (as extra adds and a shrd are required for the scaling to work over the whole domain: 0..(1<<32)/y). However, there are still gains to be had where y < z and a simple 32-bit scale factor can be used.

In the following example, fast is at least 20% faster than std with gcc 3.4 (Cygwin) when compiled without inlining (gcc -O2 -fomit-frame-pointer).

uint32_t std(uint32_t x) {
  return x * 40 / 47;
} 

uint32_t fast(uint32_t x) {
  static const uint64_t fact = ((((uint64_t)40) << 32) + 46) / 47;
  return (x * fact) >> 32;
}

        .def    _std;   .scl    2;      .type   32;     .endef
_std:
        movl    4(%esp), %edx
        movl    $-1370734243, %eax
        leal    (%edx,%edx,4), %edx
        sall    $3, %edx
        mull    %edx
        shrl    $5, %edx
        movl    %edx, %eax
        ret
        .p2align 4,,15
.globl _fast
        .def    _fast;  .scl    2;      .type   32;     .endef
_fast:
        movl    $-639675980, %eax
        mull    4(%esp)
        movl    %edx, %eax
        xorl    %edx, %edx
        ret
Comment 5 Andrew Robb 2008-08-31 13:40:19 UTC
For the case where the overall scale is between 1 and 2 (exclusive), it is straightforward to implement a solution with a single multiply and no shifts:

/* Demonstrate fast multiply and divide by rational number, r, where 1 < r < 2
 * Both methods work over entire domain: 0 < x < (1<<32)/r.
 */
#include <stdint.h>
#define Y 47
#define Z 40

uint32_t slow(uint32_t x) {
  return x * (uint64_t) Y / Z;
}

uint32_t fast(uint32_t x) {
  static const uint64_t fact = ((((uint64_t)(Y-Z)) << 32) + (Z-1)) / Z;
  return x + (uint32_t)((x * fact) >> 32);
}

This compiles to the following with gcc 4.2.1 on i586-suse-linux

gcc -O3 -fomit-frame-pointer

        .file   "vat.c"
.globl __udivdi3
        .text
        .p2align 4,,15
.globl slow
        .type   slow, @function
slow:
        subl    $28, %esp
        movl    $47, %eax
        mull    32(%esp)
        movl    $40, 8(%esp)
        movl    $0, 12(%esp)
        movl    %eax, (%esp)
        movl    %edx, 4(%esp)
        call    __udivdi3
        addl    $28, %esp
        ret
        .size   slow, .-slow
        .p2align 4,,15
.globl fast
        .type   fast, @function
fast:
        subl    $12, %esp
        movl    $751619277, %ecx
        movl    %ebx, (%esp)
        movl    16(%esp), %ebx
        movl    %edi, 8(%esp)
        movl    %esi, 4(%esp)
        movl    %ebx, %eax
        mull    %ecx
        movl    %edx, %edi
        movl    %eax, %esi
        movl    %edi, %esi
        xorl    %edi, %edi
        movl    8(%esp), %edi
        leal    (%ebx,%esi), %eax
        movl    (%esp), %ebx
        movl    4(%esp), %esi
        addl    $12, %esp
        ret
        .size   fast, .-fast
# additional hand-coded assembler equivalent to fast above:
        .p2align 4,,15
.globl faster
        .type   faster, @function
faster:
        movl    $751619277, %eax
        mull    4(%esp)
        movl    4(%esp), %eax
        addl    %edx, %eax
        ret
        .size   faster, .-faster
        .ident  "GCC: (GNU) 4.2.1 (SUSE Linux)"
        .section        .note.GNU-stack,"",@progbits
Comment 6 Andrew Robb 2008-09-01 05:50:22 UTC
OK - it's me again. I have formulated a general case optimization - I should keep quiet now ;-) :

/* Demonstrate a general fast multiply and divide by rational number.
 * Both methods agree over the entire domain: 0..0xffffffffu.
 * (even when they overflow the 32-bit result)
 */
#include <stdint.h>
/* 32-bit numerator */
#define Y 47
/* 32-bit denominator */
#define Z 40
/* bit shift (at least 32) to normalize 'fact' below (with MSB set) */
#define B (32 + 2)

/* this is what we might write - but it calls a 64-bit divide function */
uint32_t slow(uint32_t x) {
  return x * (uint64_t) Y / Z;
}

/* this is how it can be implemented without a 64-bit divide call */
uint32_t fast(uint32_t x) {
  static const uint64_t fact = ((((uint64_t)(Y % Z)) << B) + (Z - 1)) / Z;
  /*
   * While this is constructed as 2 multiplies, the value of (Y / Z) will
   * often be small (e.g. 0 or 1) and can be implemented without an imul
   * instruction. This can overwrite the value of %eax after the mul
   * instruction is issued as we only need the value from %edx.
   * A superscalar x86 will assign a new %eax register to allow the mul
   * instruction to continue in parallel.
   *
   * Where the routine is not inline, it can be implemented as:
   *    movl    $-1288490188, %eax
   *    mull    4(%esp)
   *    movl    4(%esp), %eax
   *    # small multiplication by (Y/Z) could go here
   *    shrl    $2, %edx
   *    addl    %edx, %eax
   *    ret
   *
   * Where this routine is inlined and x is already in a register other than
   * %eax or %edx, the final multiply and addition might be achieved with a
   * single lea instruction into %eax. Example: assume that x is in %ebx with
   * the above ratio (Y/Z is 1):
   *    movl    $-1288490188, %eax
   *    mull    %ebx
   *    shrl    $2, %edx
   *    leal    (%edx, %ebx), %eax
   */
  return (uint32_t)((x * fact) >> B) + x * (Y / Z);
}

NOTE: the general case 64-bit result can be calculated with at most 2 mul instructions and no divides.

uint64_t fast64(uint32_t x) {
  static const uint64_t fact = ((((uint64_t)(Y % Z)) << B) + (Z - 1)) / Z;
  return ((x * fact) >> B) + x * (uint64_t)(Y / Z);
}

With the above ratio of 47 / 40, this could be compiled to:

        movl    $-1288490188, %eax
        mull    4(%esp)
        movl    4(%esp), %eax
        shrl    $2, %edx
        addl    %edx, %eax
        xorl    %edx, %edx
        adcl    $0, %edx
        ret

Where the ratio is less than 1, the 32-bit and 64-bit versions are very similar - both using a single mul instruction. (The 64-bit version must zero %edx.)