This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.
| Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
|---|---|---|
| Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
| Other format: | [Raw text] | |
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=28417
Right now Bugzilla internal problem prevents me from creating
an attachement there. So it goes here.
Not nice enough to go into release.
I still cannot figure out what precision is, so I restricted new code to
(n == HOST_BITS_PER_WIDE_INT && precision == HOST_BITS_PER_WIDE_INT) case.
Need help here.
Patched versus stock gcc-4.1.1 on this code:
unsigned v;
void f9188_mul365384439_shift27(unsigned A) { v = A/(unsigned)1577682821; }
# diff -u suboptimal-411-O2.s suboptimal-411-O2-fixed.s
--- suboptimal-411-O2.s 2006-07-29 19:34:09.000000000 +0200
+++ suboptimal-411-O2-fixed.s 2006-07-29 19:37:14.000000000 +0200
@@ -19,21 +19,10 @@
f9188_mul365384439_shift27:
pushl %ebp
movl %esp, %ebp
- pushl %edi
- pushl %esi
- pushl %ebx
- movl 8(%ebp), %ebx
- movl $1551183727, %ecx
- movl %ebx, %eax
- mull %ecx
- subl %edx, %ebx
- shrl %ebx
- leal (%ebx,%edx), %eax
- shrl $30, %eax
- movl %eax, v
- popl %ebx
- popl %esi
- popl %edi
+ movl $365384439, %eax
+ mull 8(%ebp)
+ shrl $27, %edx
+ movl %edx, v
leave
ret
...
suboptimal.c is a test program. -O2 -S on it will give you the
optimized assembly, running it (compiled with -Os)
will check the correctness of the optimized code.
--
vda
New algorithm lives in separate function gcc_fastdiv_params() which
is called from choose_multiplier(). Then we run old algorithm.
Then we compare results and if new algorithm gives different one
(new result is always better or same than old), we use it.
I still cannot figure out what precision is, so I restricted new code to
(n == HOST_BITS_PER_WIDE_INT && precision == HOST_BITS_PER_WIDE_INT) case.
Need help here.
diff -urpN gcc-4.1.1.orig/gcc/expmed.c gcc-4.1.1.new/gcc/expmed.c
--- gcc-4.1.1.orig/gcc/expmed.c 2006-05-15 15:33:32.000000000 +0200
+++ gcc-4.1.1.new/gcc/expmed.c 2006-07-29 19:58:55.000000000 +0200
@@ -3224,6 +3224,128 @@ ceil_log2 (unsigned HOST_WIDE_INT x)
return floor_log2 (x - 1) + 1;
}
+
+/*
+[below: 'div' is unsigned integer division]
+['/' is real division with infinite precision]
+[A,B,C... - integers, a,b,c... - reals]
+
+Theory: we want to calculate A div B, fast.
+B is const > 2 and is not a power of 2.
+
+In fp: A/B = A*(L/B)/L. (If L is a large power of 2,
+say 2^32, then it could be done really fast).
+Let k := L/B, K := L div B + 1.
+Then A/B = A*k/L.
+
+Then this is true:
+
+A div B <= A * K div L.
+
+For small enough A: A div B = A * K div L.
+Lets find first A for which it is not true.
+
+Lets compare k/L and K/L. K/L is larger by a small value d:
+
+d := K/L - k/L = (L div B + 1) / L - L/B/L =
+= (L div B * B + B) / (L*B) - L/(L*B) =
+= (L div B * B + B - L) / (L*B)
+
+A*K/L is larger than A*k/L by A*d.
+
+A*k/L is closest to 'overflowing into next integer'
+when A = N*B-1. The 'deficit' with such A is exactly 1/B.
+If A*d >= 1/B, then A*K/L will 'overflow'.
+
+Thus bad_A >= 1/B / d = (1/B) * (L*B)/(L div B * B + B - L) =
+= L/(L div B * B + B - L). Then you need to 'walk up' to next
+A representable as N*B-1: bad_A2 = (bad_A div B) * B + B-1
+This is the first A which will have wrong result
+(i.e. for which (A*K div L) = (A div B)+1, not (A div B).
+
+Practical use.
+
+Suppose that A and B are 32-bit unsigned integers.
+
+Unfortunately, there exist only two B values in 3..2^32-1 range
+for which _any_ 32-bit unsigned A can be fast divided using L=2^32
+(because bad_A=2^32 and any A is less than that):
+
+B=641 K=6700417 d=1/2753074036736 bad_A=4294967296 A=unlimited
+B=6700417 K=641 d=1/28778071884562432 bad_A=4294967296 A=unlimited
+
+We need to use larger powers of 2 for L if we need to handle
+many more B's. */
+
+#define max_HOST_WIDE_INT ((unsigned HOST_WIDE_INT) (~(HOST_WIDE_INT)0))
+int gcc_fastdiv_params(unsigned HOST_WIDE_INT B, unsigned HOST_WIDE_INT max_A,
+ unsigned HOST_WIDE_INT *multiplier_ptr, int *shift_ptr);
+int gcc_fastdiv_params(unsigned HOST_WIDE_INT B, unsigned HOST_WIDE_INT max_A,
+ unsigned HOST_WIDE_INT *multiplier_ptr, int *shift_ptr)
+{
+ unsigned HOST_WIDE_INT K, last_K, d_LB, bad_A, udummy;
+ HOST_WIDE_INT high_A, dummy;
+ int lgdn;
+
+ if (!B || !((B-1)&B)) return 0; /* we won't work with powers of 2 */
+
+ /* L/B should fit into uint, so max L is floor_log2(B) << HOST_BITS_PER_WIDE_INT
+ ** L is impicitly = 1 << (lgdn+HOST_BITS_PER_WIDE_INT) */
+ lgdn = floor_log2(B);
+
+ /* K = L/B + 1: */
+ div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, B,(HOST_WIDE_INT)0,
+ &K,&dummy, &udummy,&dummy);
+ K++;
+
+ /* There is not much point in multiplying by even number
+ ** and then shifting right. Reduce K & L to avoid it: */
+ while (!(K & 1) && lgdn)
+ lgdn--, K >>= 1;
+
+ /* d_LB = ((L/B) * B + B - L) */
+ d_LB = K * B; /* since (HOST_WIDE_INT)L == 0, subtracting it can be optimized out */
+ /* bad_A = L / d_LB */
+ div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, d_LB,(HOST_WIDE_INT)0,
+ &bad_A,&high_A, &udummy,&dummy);
+
+ bad_A = (bad_A / B) * B; /* ... + B-1 later */
+ if(high_A || bad_A > max_HOST_WIDE_INT - (B-1) ) {
+ high_A = 1; /* we aren't interested much in true value if high_A!=0 */
+ }
+ else bad_A += B-1;
+
+ if (!high_A && bad_A <= max_A)
+ return 0; /* no suitable params exist :( */
+
+ /* K is good, but maybe we can reduce it? */
+ while(1) {
+ last_K = K;
+ if (--lgdn < 0)
+ break;
+ /* K = L/B + 1: */
+ div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, B,(HOST_WIDE_INT)0,
+ &K,&dummy, &udummy,&dummy);
+ K++;
+ /* d_LB = ((L/B) * B + B - L) */
+ d_LB = K * B; /* since (HOST_WIDE_INT)L == 0, subtracting it can be optimized out */
+ /* bad_A = L / d_LB */
+ div_and_round_double(TRUNC_DIV_EXPR, 1, 0,1<<lgdn, d_LB,(HOST_WIDE_INT)0,
+ &bad_A,&high_A, &udummy,&dummy);
+ bad_A = (bad_A / B) * B;
+ if(high_A || bad_A > max_HOST_WIDE_INT - (B-1) )
+ high_A = 1; /* we aren't interested much in true value if high_A!=0 */
+ else bad_A += B-1;
+ if (!high_A && bad_A <= max_A)
+ break;
+ }
+ /* new params are bad, report last ones */
+ *multiplier_ptr = last_K;
+ *shift_ptr = lgdn+1;
+ return 1;
+}
+
+
/* Choose a minimal N + 1 bit approximation to 1/D that can be used to
replace division by D, and put the least significant N bits of the result
in *MULTIPLIER_PTR and return the most significant bit.
@@ -3252,6 +3374,14 @@ choose_multiplier (unsigned HOST_WIDE_IN
unsigned HOST_WIDE_INT nl, dummy1;
HOST_WIDE_INT nh, dummy2;
+ int fastdiv_params;
+ unsigned HOST_WIDE_INT K;
+ int bits;
+
+ fastdiv_params = 0;
+ if(n == HOST_BITS_PER_WIDE_INT && precision == HOST_BITS_PER_WIDE_INT)
+ fastdiv_params = gcc_fastdiv_params(d, max_HOST_WIDE_INT, &K, &bits);
+
/* lgup = ceil(log2(divisor)); */
lgup = ceil_log2 (d);
@@ -3266,7 +3396,7 @@ choose_multiplier (unsigned HOST_WIDE_IN
gcc_assert (pow != 2 * HOST_BITS_PER_WIDE_INT);
/* mlow = 2^(N + lgup)/d */
- if (pow >= HOST_BITS_PER_WIDE_INT)
+ if (pow >= HOST_BITS_PER_WIDE_INT)
{
nh = (HOST_WIDE_INT) 1 << (pow - HOST_BITS_PER_WIDE_INT);
nl = 0;
@@ -3321,6 +3451,12 @@ choose_multiplier (unsigned HOST_WIDE_IN
else
{
*multiplier_ptr = GEN_INT (mhigh_lo);
+ if(fastdiv_params && (mhigh_hi || K!=mhigh_lo))
+ {
+ *post_shift_ptr = bits;
+ *multiplier_ptr = GEN_INT (K);
+ return 0;
+ }
return mhigh_hi;
}
}
#include <stdint.h>
#include <stdio.h>
unsigned v;
void f9952_mul764333263_shift27(unsigned A) { v=A/754200792; }
void f9188_mul365384439_shift27(unsigned A) { v = A/(unsigned)1577682821; }
void f9188_mul365384439_shift27_prime(unsigned A) { v = (((uint64_t)A)*(unsigned)365384439) >> (27+32); }
void f8399_mul2283243215_shift29(unsigned A) { v = A/(unsigned)1009898111; }
void f8399_mul2283243215_shift29_prime(unsigned A) { v = (((uint64_t)A)*(unsigned)2283243215) >> (29+32); }
void f8267_mul2482476753_shift30(unsigned A) { v = A/(unsigned)1857695551; }
void f8267_mul2482476753_shift30_prime(unsigned A) { v = (((uint64_t)A)*(unsigned)2482476753) >> (30+32); }
/* Generated code is suboptimal (gcc 4.1.1):
void f9188_mul365384439_shift27(unsigned A) { v = A/1577682821; }
f9188_mul365384439_shift27:
pushl %edi
pushl %esi
pushl %ebx
movl 16(%esp), %ebx
movl $1551183727, %ecx
movl %ebx, %eax
mull %ecx
subl %edx, %ebx
shrl %ebx
leal (%ebx,%edx), %eax
shrl $30, %eax
movl %eax, v
popl %ebx
popl %esi
popl %edi
ret
void f8399_mul2283243215_shift29(unsigned A) { v = A/1009898111; }
f8399_mul2283243215_shift29:
pushl %edi
pushl %esi
pushl %ebx
movl 16(%esp), %ebx
movl $271519133, %ecx
movl %ebx, %eax
mull %ecx
subl %edx, %ebx
shrl %ebx
leal (%ebx,%edx), %eax
shrl $29, %eax
movl %eax, v
popl %ebx
popl %esi
popl %edi
ret
void f8267_mul2482476753_shift30(unsigned A) { v = A/1857695551; }
f8267_mul2482476753_shift30:
pushl %edi
pushl %esi
pushl %ebx
movl 16(%esp), %ebx
movl $669986209, %ecx
movl %ebx, %eax
mull %ecx
subl %edx, %ebx
shrl %ebx
leal (%ebx,%edx), %eax
shrl $30, %eax
movl %eax, v
popl %ebx
popl %esi
popl %edi
ret
These operations can be done like this:
f9188_mul365384439_shift27_prime:
movl $365384439, %eax
mull 4(%esp)
movl %edx, %eax
shrl $27, %eax
movl %eax, v
ret
f8399_mul2283243215_shift29_prime:
movl $-2011724081, %eax
mull 4(%esp)
movl %edx, %eax
shrl $29, %eax
movl %eax, v
ret
f8267_mul2482476753_shift30_prime:
movl $-1812490543, %eax
mull 4(%esp)
movl %edx, %eax
shrl $30, %eax
movl %eax, v
ret
(Why there is that silly movl %edx, %eax - that's another good question...)
The verification program is below. Was compiled with -Os
(in order to force gcc to use div insn - we want to do true
division for the purpose of correctness check!)
and didn't report a failure.
*/
int main()
{
unsigned A=0,u,v;
while(++A) {
(A & 0xffff) || printf("A=%u\r",A);
u = (((uint64_t)A)*(unsigned)365384439) >> (27+32);
v = A / (unsigned)1577682821;
if(u!=v) { printf("1: A=%u u=%u v=%u\n", A,u,v); return 0; }
u = (((uint64_t)A)*(unsigned)2283243215) >> (29+32);
v = A / (unsigned)1009898111;
if(u!=v) { printf("2: A=%u u=%u v=%u\n", A,u,v); return 0; }
u = (((uint64_t)A)*(unsigned)2482476753) >> (30+32);
v =A / (unsigned)1857695551;
if(u!=v) { printf("3: A=%u u=%u v=%u\n", A,u,v); return 0; }
}
puts("");
return 0;
}
Attachment:
suboptimal-411-O2-fixed.s
Description: Binary data
Attachment:
suboptimal-411-O2.s
Description: Binary data
| Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
|---|---|---|
| Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |