[Bug rtl-optimization/97459] __uint128_t remainder for division by 3

tkoenig at gcc dot gnu.org gcc-bugzilla@gcc.gnu.org
Sat Oct 17 12:22:02 GMT 2020


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97459

--- Comment #4 from Thomas Koenig <tkoenig at gcc dot gnu.org> ---
Here's a complete program for benchmarks on x86_64, using Jakub's
functions (so they are indeed correct):

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <x86intrin.h>

unsigned r3_128u_v2 (__uint128_t n)
{
  return (unsigned) (n%3);
}

unsigned r3_128u_v3 (__uint128_t n)
{
  unsigned long a;
  a = (n >> 88);
  a += (n >> 44) & 0xfffffffffffULL;
  a += (n & 0xfffffffffffULL);
  return a % 3;
}

unsigned r3_128u_v4 (__uint128_t n)
{
  unsigned long a;
  a = (n >> 96);
  a += (n >> 64) & 0xffffffffULL;
  a += (n >> 32) & 0xffffffffULL;
  a += (n & 0xffffffffULL);
  return a % 3;
}

#define N 1000000

int main()
{
  __uint128_t *a;
  unsigned int s;
  unsigned long t1, t2;
  int fd;
  int i;
  a = malloc (sizeof (*a) * N);
  fd = open ("/dev/random", O_RDONLY);
  read (fd, a, sizeof (*a) * N);
  s = 0;
  t1 = __rdtsc();
  for (i=0; i<N; i++)
    s += r3_128u_v2(a[i]);
  t2 = __rdtsc();
  printf ("s = %u r3_128u_v2: %f cycles per iteration\n", s, (t2-t1)/(double)
N);

  s = 0;
  t1 = __rdtsc();
  for (i=0; i<N; i++)
    s += r3_128u_v3(a[i]);
  t2 = __rdtsc();
  printf ("s = %u r3_128u_v2: %f cycles per iteration\n", s, (t2-t1)/(double)
N);

  s = 0;
  t1 = __rdtsc();
  for (i=0; i<N; i++)
    s += r3_128u_v4(a[i]);
  t2 = __rdtsc();
  printf ("s = %u r3_128u_v2: %f cycles per iteration\n", s, (t2-t1)/(double)
N);

}

And here are the results on my box using -O3 -march=native:

s = 7 r3_128u_v2: 22.204618 cycles per iteration
s = 7 r3_128u_v2: 8.143544 cycles per iteration
s = 7 r3_128u_v2: 6.110718 cycles per iteration


More information about the Gcc-bugs mailing list