[Bug rtl-optimization/97459] __uint128_t remainder for division by 3
tkoenig at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Sat Oct 17 12:22:02 GMT 2020
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97459
--- Comment #4 from Thomas Koenig <tkoenig at gcc dot gnu.org> ---
Here's a complete program for benchmarks on x86_64, using Jakub's
functions (so they are indeed correct):
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <x86intrin.h>
unsigned r3_128u_v2 (__uint128_t n)
{
return (unsigned) (n%3);
}
unsigned r3_128u_v3 (__uint128_t n)
{
unsigned long a;
a = (n >> 88);
a += (n >> 44) & 0xfffffffffffULL;
a += (n & 0xfffffffffffULL);
return a % 3;
}
unsigned r3_128u_v4 (__uint128_t n)
{
unsigned long a;
a = (n >> 96);
a += (n >> 64) & 0xffffffffULL;
a += (n >> 32) & 0xffffffffULL;
a += (n & 0xffffffffULL);
return a % 3;
}
#define N 1000000
int main()
{
__uint128_t *a;
unsigned int s;
unsigned long t1, t2;
int fd;
int i;
a = malloc (sizeof (*a) * N);
fd = open ("/dev/random", O_RDONLY);
read (fd, a, sizeof (*a) * N);
s = 0;
t1 = __rdtsc();
for (i=0; i<N; i++)
s += r3_128u_v2(a[i]);
t2 = __rdtsc();
printf ("s = %u r3_128u_v2: %f cycles per iteration\n", s, (t2-t1)/(double)
N);
s = 0;
t1 = __rdtsc();
for (i=0; i<N; i++)
s += r3_128u_v3(a[i]);
t2 = __rdtsc();
printf ("s = %u r3_128u_v2: %f cycles per iteration\n", s, (t2-t1)/(double)
N);
s = 0;
t1 = __rdtsc();
for (i=0; i<N; i++)
s += r3_128u_v4(a[i]);
t2 = __rdtsc();
printf ("s = %u r3_128u_v2: %f cycles per iteration\n", s, (t2-t1)/(double)
N);
}
And here are the results on my box using -O3 -march=native:
s = 7 r3_128u_v2: 22.204618 cycles per iteration
s = 7 r3_128u_v2: 8.143544 cycles per iteration
s = 7 r3_128u_v2: 6.110718 cycles per iteration
More information about the Gcc-bugs
mailing list