[Bug rtl-optimization/97459] __uint128_t remainder for division by 3
tkoenig at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Tue Oct 20 18:16:52 GMT 2020
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97459
--- Comment #9 from Thomas Koenig <tkoenig at gcc dot gnu.org> ---
(In reply to Jakub Jelinek from comment #7)
> So, can we use this for anything but modulo 3, or 5, or 17, or 257 (all of
> those have 2^32 mod N == 2^64 mod N == 2^128 mod N == 1)
I think so, too.
> probably also
> keyed on the target having corresponding uaddv4_optab handler, normal
> expansion not being able to handle it and emitting a libcall?
Again, yes.
This can also be used as a building block for handling division
and remainder base 10.
Here's a benchmark for this (it uses the sum of digits base 10
instead). qsum1 uses the standard method, which you can find
(for example) in libgfortran.
div_rem5_v2 first calculates the remainder of the division by 5 using this
method, then does an exact division by multiplying with its modular inverse
for 2^128.
div_rem10_v2 then uses div_rem5_v2 to calculate the value and
remainder of the division by 10, and qsum_v2 uses that to
calculate the sum of digits.
The timings are about a factor of 2 faster than the straightforward
libcall version:
s = 360398898 qsum_v1: 1.091621 s
s = 360398898 qsum_v2: 0.485509 s
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/time.h>
#include <stdlib.h>
#define ONE ((__uint128_t) 1)
#define TWO_64 (ONE << 64)
typedef __uint128_t mytype;
double this_time ()
{
struct timeval tv;
gettimeofday (&tv, NULL);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
unsigned
qsum_v1 (mytype n)
{
unsigned ret;
ret = 0;
while (n > 0)
{
ret += n % 10;
n = n / 10;
}
return ret;
}
static void inline __attribute__((always_inline))
div_rem_5_v2 (mytype n, mytype *div, unsigned *rem)
{
unsigned long a, b, c;
/* The modular inverse to 5 modulo 2^128 */
const mytype magic = (0xCCCCCCCCCCCCCCCC * TWO_64 + 0xCCCCCCCCCCCCCCCD *
ONE);
b = n >> 64;
c = n;
if (__builtin_add_overflow (b, c, &a))
a++;
*rem = a % 5;
*div = (n-*rem) * magic;
}
static void inline __attribute__((always_inline))
div_rem_10_v2 (mytype n, mytype *div, unsigned *rem)
{
mytype n5;
unsigned rem5;
div_rem_5_v2 (n, &n5, &rem5);
*rem = rem5 + (n5 % 2) * 5;
*div = n5/2;
}
unsigned
qsum_v2 (mytype n)
{
unsigned ret;
unsigned rem;
mytype n_new;
ret = 0;
while (n > 0)
{
div_rem_10_v2 (n, &n_new, &rem);
ret += rem;
n = n_new;
}
return ret;
}
#define N 10000000
int main()
{
mytype *a;
unsigned long int s;
double t1, t2;
int fd;
long int i;
a = malloc (sizeof (*a) * N);
fd = open ("/dev/urandom", O_RDONLY);
read (fd, a, sizeof (*a) * N);
s = 0;
t1 = this_time();
for (i=0; i<N; i++)
s += qsum_v1(a[i]);
t2 = this_time();
printf ("s = %lu qsum_v1: %f s\n", s, (t2-t1));
s = 0;
t1 = this_time();
for (i=0; i<N; i++)
s += qsum_v2(a[i]);
t2 = this_time();
printf ("s = %lu qsum_v2: %f s\n", s, (t2-t1));
return 0;
}
More information about the Gcc-bugs
mailing list