This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] improved algorithm for gcc/expmed.c::choose_multiplier()


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=28417

Right now Bugzilla internal problem prevents me from creating
an attachement there. So it goes here.

Not nice enough to go into release.
I still cannot figure out what precision is, so I restricted new code to
(n == HOST_BITS_PER_WIDE_INT && precision == HOST_BITS_PER_WIDE_INT) case.
Need help here.

Patched versus stock gcc-4.1.1 on this code:

unsigned v;
void f9188_mul365384439_shift27(unsigned A) { v = A/(unsigned)1577682821; }

# diff -u suboptimal-411-O2.s suboptimal-411-O2-fixed.s
--- suboptimal-411-O2.s 2006-07-29 19:34:09.000000000 +0200
+++ suboptimal-411-O2-fixed.s   2006-07-29 19:37:14.000000000 +0200
@@ -19,21 +19,10 @@
 f9188_mul365384439_shift27:
        pushl   %ebp
        movl    %esp, %ebp
-       pushl   %edi
-       pushl   %esi
-       pushl   %ebx
-       movl    8(%ebp), %ebx
-       movl    $1551183727, %ecx
-       movl    %ebx, %eax
-       mull    %ecx
-       subl    %edx, %ebx
-       shrl    %ebx
-       leal    (%ebx,%edx), %eax
-       shrl    $30, %eax
-       movl    %eax, v
-       popl    %ebx
-       popl    %esi
-       popl    %edi
+       movl    $365384439, %eax
+       mull    8(%ebp)
+       shrl    $27, %edx
+       movl    %edx, v
        leave
        ret
...

suboptimal.c is a test program. -O2 -S on it will give you the
optimized assembly, running it (compiled with -Os)
will check the correctness of the optimized code.
--
vda
New algorithm lives in separate function gcc_fastdiv_params() which
is called from choose_multiplier(). Then we run old algorithm.
Then we compare results and if new algorithm gives different one
(new result is always better or same than old), we use it.

I still cannot figure out what precision is, so I restricted new code to
(n == HOST_BITS_PER_WIDE_INT && precision == HOST_BITS_PER_WIDE_INT) case.
Need help here.

diff -urpN gcc-4.1.1.orig/gcc/expmed.c gcc-4.1.1.new/gcc/expmed.c
--- gcc-4.1.1.orig/gcc/expmed.c	2006-05-15 15:33:32.000000000 +0200
+++ gcc-4.1.1.new/gcc/expmed.c	2006-07-29 19:58:55.000000000 +0200
@@ -3224,6 +3224,128 @@ ceil_log2 (unsigned HOST_WIDE_INT x)
   return floor_log2 (x - 1) + 1;
 }
 
+
+/*
+[below: 'div' is unsigned integer division]
+['/' is real division with infinite precision]
+[A,B,C... - integers, a,b,c... - reals]
+
+Theory: we want to calculate A div B, fast.
+B is const > 2 and is not a power of 2.
+
+In fp: A/B = A*(L/B)/L. (If L is a large power of 2,
+say 2^32, then it could be done really fast).
+Let k := L/B, K := L div B + 1.
+Then A/B = A*k/L.
+
+Then this is true:
+
+A div B <= A * K div L.
+
+For small enough A: A div B = A * K div L.
+Lets find first A for which it is not true.
+
+Lets compare k/L and K/L. K/L is larger by a small value d:
+
+d := K/L - k/L = (L div B + 1) / L - L/B/L =
+= (L div B * B + B) / (L*B) - L/(L*B) =
+= (L div B * B + B - L) / (L*B)
+
+A*K/L is larger than A*k/L by A*d.
+
+A*k/L is closest to 'overflowing into next integer'
+when A = N*B-1. The 'deficit' with such A is exactly 1/B.
+If A*d >= 1/B, then A*K/L will 'overflow'.
+
+Thus bad_A >= 1/B / d = (1/B) * (L*B)/(L div B * B + B - L) =
+= L/(L div B * B + B - L). Then you need to 'walk up' to next
+A representable as N*B-1: bad_A2 = (bad_A div B) * B + B-1
+This is the first A which will have wrong result
+(i.e. for which (A*K div L) = (A div B)+1, not (A div B).
+
+Practical use.
+
+Suppose that A and B are 32-bit unsigned integers.
+
+Unfortunately, there exist only two B values in 3..2^32-1 range
+for which _any_ 32-bit unsigned A can be fast divided using L=2^32
+(because bad_A=2^32 and any A is less than that):
+
+B=641 K=6700417 d=1/2753074036736 bad_A=4294967296 A=unlimited
+B=6700417 K=641 d=1/28778071884562432 bad_A=4294967296 A=unlimited
+
+We need to use larger powers of 2 for L if we need to handle
+many more B's. */
+
+#define max_HOST_WIDE_INT ((unsigned HOST_WIDE_INT) (~(HOST_WIDE_INT)0))
+int gcc_fastdiv_params(unsigned HOST_WIDE_INT B, unsigned HOST_WIDE_INT max_A,
+		unsigned HOST_WIDE_INT *multiplier_ptr, int *shift_ptr);
+int gcc_fastdiv_params(unsigned HOST_WIDE_INT B, unsigned HOST_WIDE_INT max_A,
+		unsigned HOST_WIDE_INT *multiplier_ptr, int *shift_ptr)
+{
+	unsigned HOST_WIDE_INT K, last_K, d_LB, bad_A, udummy;
+	HOST_WIDE_INT high_A, dummy;
+	int lgdn;
+
+	if (!B || !((B-1)&B)) return 0; /* we won't work with powers of 2 */
+
+	/* L/B should fit into uint, so max L is floor_log2(B) << HOST_BITS_PER_WIDE_INT
+	** L is impicitly = 1 << (lgdn+HOST_BITS_PER_WIDE_INT) */
+	lgdn = floor_log2(B);
+
+	/* K = L/B + 1: */
+        div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, B,(HOST_WIDE_INT)0,
+			&K,&dummy, &udummy,&dummy);
+	K++;
+
+	/* There is not much point in multiplying by even number
+	** and then shifting right. Reduce K & L to avoid it: */
+	while (!(K & 1) && lgdn)
+		lgdn--, K >>= 1;
+
+	/* d_LB = ((L/B) * B + B - L) */
+	d_LB = K * B; /* since (HOST_WIDE_INT)L == 0, subtracting it can be optimized out */
+	/* bad_A = L / d_LB */
+        div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, d_LB,(HOST_WIDE_INT)0,
+			&bad_A,&high_A, &udummy,&dummy);
+
+	bad_A = (bad_A / B) * B; /* ... + B-1 later */
+	if(high_A || bad_A > max_HOST_WIDE_INT - (B-1) ) {
+		high_A = 1; /* we aren't interested much in true value if high_A!=0 */
+	}
+	else bad_A += B-1;
+
+	if (!high_A && bad_A <= max_A)
+		return 0; /* no suitable params exist :( */
+
+	/* K is good, but maybe we can reduce it? */
+	while(1) {
+		last_K = K;
+		if (--lgdn < 0)
+			break;
+		/* K = L/B + 1: */
+    		div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, B,(HOST_WIDE_INT)0,
+			&K,&dummy, &udummy,&dummy);
+		K++;
+		/* d_LB = ((L/B) * B + B - L) */
+		d_LB = K * B; /* since (HOST_WIDE_INT)L == 0, subtracting it can be optimized out */
+		/* bad_A = L / d_LB */
+		div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, d_LB,(HOST_WIDE_INT)0,
+			&bad_A,&high_A, &udummy,&dummy);
+		bad_A = (bad_A / B) * B;
+		if(high_A || bad_A > max_HOST_WIDE_INT - (B-1) )
+			high_A = 1; /* we aren't interested much in true value if high_A!=0 */
+		else bad_A += B-1;
+		if (!high_A && bad_A <= max_A)
+			break;
+	}
+	/* new params are bad, report last ones */
+	*multiplier_ptr = last_K;
+	*shift_ptr = lgdn+1;
+	return 1;
+}
+
+
 /* Choose a minimal N + 1 bit approximation to 1/D that can be used to
    replace division by D, and put the least significant N bits of the result
    in *MULTIPLIER_PTR and return the most significant bit.
@@ -3252,6 +3374,14 @@ choose_multiplier (unsigned HOST_WIDE_IN
   unsigned HOST_WIDE_INT nl, dummy1;
   HOST_WIDE_INT nh, dummy2;
 
+  int fastdiv_params;
+  unsigned HOST_WIDE_INT K;
+  int bits;
+
+  fastdiv_params = 0;
+  if(n == HOST_BITS_PER_WIDE_INT && precision == HOST_BITS_PER_WIDE_INT)
+    fastdiv_params = gcc_fastdiv_params(d, max_HOST_WIDE_INT, &K, &bits);
+
   /* lgup = ceil(log2(divisor)); */
   lgup = ceil_log2 (d);
 
@@ -3266,7 +3396,7 @@ choose_multiplier (unsigned HOST_WIDE_IN
   gcc_assert (pow != 2 * HOST_BITS_PER_WIDE_INT);
 
   /* mlow = 2^(N + lgup)/d */
- if (pow >= HOST_BITS_PER_WIDE_INT)
+  if (pow >= HOST_BITS_PER_WIDE_INT)
     {
       nh = (HOST_WIDE_INT) 1 << (pow - HOST_BITS_PER_WIDE_INT);
       nl = 0;
@@ -3321,6 +3451,12 @@ choose_multiplier (unsigned HOST_WIDE_IN
   else
     {
       *multiplier_ptr = GEN_INT (mhigh_lo);
+      if(fastdiv_params && (mhigh_hi || K!=mhigh_lo))
+        {
+          *post_shift_ptr = bits;
+          *multiplier_ptr = GEN_INT (K);
+          return 0;
+        }
       return mhigh_hi;
     }
 }
#include <stdint.h>
#include <stdio.h>

unsigned v;

void f9952_mul764333263_shift27(unsigned A) { v=A/754200792; }

void f9188_mul365384439_shift27(unsigned A) { v = A/(unsigned)1577682821; }
void f9188_mul365384439_shift27_prime(unsigned A) { v = (((uint64_t)A)*(unsigned)365384439) >> (27+32); }
void f8399_mul2283243215_shift29(unsigned A) { v = A/(unsigned)1009898111; }
void f8399_mul2283243215_shift29_prime(unsigned A) { v = (((uint64_t)A)*(unsigned)2283243215) >> (29+32); }
void f8267_mul2482476753_shift30(unsigned A) { v = A/(unsigned)1857695551; }
void f8267_mul2482476753_shift30_prime(unsigned A) { v = (((uint64_t)A)*(unsigned)2482476753) >> (30+32); }

/* Generated code is suboptimal (gcc 4.1.1):
void f9188_mul365384439_shift27(unsigned A) { v = A/1577682821; }
f9188_mul365384439_shift27:
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	16(%esp), %ebx
	movl	$1551183727, %ecx
	movl	%ebx, %eax
	mull	%ecx
	subl	%edx, %ebx
	shrl	%ebx
	leal	(%ebx,%edx), %eax
	shrl	$30, %eax
	movl	%eax, v
	popl	%ebx
	popl	%esi
	popl	%edi
	ret
void f8399_mul2283243215_shift29(unsigned A) { v = A/1009898111; }
f8399_mul2283243215_shift29:
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	16(%esp), %ebx
	movl	$271519133, %ecx
	movl	%ebx, %eax
	mull	%ecx
	subl	%edx, %ebx
	shrl	%ebx
	leal	(%ebx,%edx), %eax
	shrl	$29, %eax
	movl	%eax, v
	popl	%ebx
	popl	%esi
	popl	%edi
	ret
void f8267_mul2482476753_shift30(unsigned A) { v = A/1857695551; }
f8267_mul2482476753_shift30:
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	16(%esp), %ebx
	movl	$669986209, %ecx
	movl	%ebx, %eax
	mull	%ecx
	subl	%edx, %ebx
	shrl	%ebx
	leal	(%ebx,%edx), %eax
	shrl	$30, %eax
	movl	%eax, v
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

These operations can be done like this:
f9188_mul365384439_shift27_prime:
	movl	$365384439, %eax
	mull	4(%esp)
	movl	%edx, %eax
	shrl	$27, %eax
	movl	%eax, v
	ret
f8399_mul2283243215_shift29_prime:
	movl	$-2011724081, %eax
	mull	4(%esp)
	movl	%edx, %eax
	shrl	$29, %eax
	movl	%eax, v
	ret
f8267_mul2482476753_shift30_prime:
	movl	$-1812490543, %eax
	mull	4(%esp)
	movl	%edx, %eax
	shrl	$30, %eax
	movl	%eax, v
	ret

(Why there is that silly movl %edx, %eax - that's another good question...)

The verification program is below. Was compiled with -Os
(in order to force gcc to use div insn - we want to do true
division for the purpose of correctness check!)
and didn't report a failure.
*/

int main()
{
	unsigned A=0,u,v;
	while(++A) {
		(A & 0xffff) || printf("A=%u\r",A);

		u = (((uint64_t)A)*(unsigned)365384439) >> (27+32);
		v = A / (unsigned)1577682821;
		if(u!=v) { printf("1: A=%u u=%u v=%u\n", A,u,v); return 0; }

		u = (((uint64_t)A)*(unsigned)2283243215) >> (29+32);
		v = A / (unsigned)1009898111;
		if(u!=v) { printf("2: A=%u u=%u v=%u\n", A,u,v); return 0; }

		u = (((uint64_t)A)*(unsigned)2482476753) >> (30+32);
		v  =A / (unsigned)1857695551;
		if(u!=v) { printf("3: A=%u u=%u v=%u\n", A,u,v); return 0; }
	}
	puts("");
	return 0;
}

Attachment: suboptimal-411-O2-fixed.s
Description: Binary data

Attachment: suboptimal-411-O2.s
Description: Binary data


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]