IEEE mandates correct rounding of square roots, and sqrtq rounds incorrectly for some values:

#include <stdio.h>
#include <gmp.h>
#define MPFR_WANT_FLOAT128
#include <mpfr.h>
#include <quadmath.h>

#define NDIG 113

int main()
{
  mpfr_t a, b, tst;
  gmp_randstate_t state;
  __float128 af, bf;

  gmp_randinit_mt (state);
  mpfr_set_default_prec (NDIG);
  mpfr_init (a);
  mpfr_init (b);
  mpfr_init (tst);
  for (int i=0; i<100; i++)
    {
      mpfr_urandom (a, state, MPFR_RNDN);
      mpfr_mul_ui (a, a, 3, MPFR_RNDN);
      mpfr_add_ui (a, a, 1, MPFR_RNDN);
      af = mpfr_get_float128 (a, MPFR_RNDN);
      mpfr_sqrt (b, a, MPFR_RNDN);
      bf = sqrtq (af);
      mpfr_set_float128 (tst, bf, MPFR_RNDN);
      if (!mpfr_equal_p (tst, b))
        {
          printf ("sqrt(");
          mpfr_out_str (stdout, 16, 0, a, MPFR_RNDN);
          printf (") = ");
          mpfr_out_str (stdout, 16, 0, b, MPFR_RNDN);
          printf (" ! = ");
          mpfr_out_str (stdout, 16, 0, tst, MPFR_RNDN);
          printf ("\n");
        }
    }
  return 0;
}

gives me

sqrt(2.4e5658b5f6d2a1ec21e3829dd7f8) = 1.84bfedcba255aeaf7e99184e6f3b ! = 1.84bfedcba255aeaf7e99184e6f3a
sqrt(1.a8942b9fcf73a7cc6d2546104e8f) = 1.49af594bb6630a9358d929e60a7b ! = 1.49af594bb6630a9358d929e60a7c
sqrt(2.71ad677c998ff96c3a5af38b7ee0) = 1.9037796dd2ced85e3fde2112fe63 ! = 1.9037796dd2ced85e3fde2112fe62
sqrt(3.b8f47bfc809dc00ed76e72cc70a4) = 1.edeb6523391e23008db6386115d3 ! = 1.edeb6523391e23008db6386115d4
sqrt(1.48524d337f27113e25d7eee643b7) = 1.21ea0f970722185990c1a32be807 ! = 1.21ea0f970722185990c1a32be806
sqrt(1.c9e6c13fef31a052f5dba4c0d05e) = 1.5660ca59a7ad02740d08ec0ad237 ! = 1.5660ca59a7ad02740d08ec0ad236
sqrt(2.6c28926057d24449c618b8addbb4) = 1.8e729ca4cf7f6ab6f8cf2579a87b ! = 1.8e729ca4cf7f6ab6f8cf2579a87c
sqrt(1.51c65cf9647952785b859500bf70) = 1.260ef5983614564e3fa636e0d493 ! = 1.260ef5983614564e3fa636e0d492
sqrt(2.cf6052559128965a96f542a8c462) = 1.ad239894bbd64edaa5de12f61265 ! = 1.ad239894bbd64edaa5de12f61264
sqrt(2.bfe5732cc879053f71c5240d026e) = 1.a87f27daa19d145bb1d2756750e5 ! = 1.a87f27daa19d145bb1d2756750e4
sqrt(3.66185a13cbb8d455463507813f6c) = 1.d7f53f5b2ad068de705fe4660697 ! = 1.d7f53f5b2ad068de705fe4660698
sqrt(2.5af21489bafdf81c0047850a69e0) = 1.88e114747ee2213bc2af2c6f572f ! = 1.88e114747ee2213bc2af2c6f572e
sqrt(2.71c155bcdb555657ff0fe301660a) = 1.903dd936eb27661ee030b952dc2f ! = 1.903dd936eb27661ee030b952dc2e
sqrt(1.74ed7ee5a244f1c8a45361664fb0) = 1.34fb3bfc3d55ca48fbffb9be113f ! = 1.34fb3bfc3d55ca48fbffb9be1140
sqrt(2.414fd19bd0495ab521274e316538) = 1.806fe03d3244345277e674340bdb ! = 1.806fe03d3244345277e674340bda
sqrt(2.4853c8d694a5aa889070fdab21c6) = 1.82c40b824e75149ee39e9b1c8fed ! = 1.82c40b824e75149ee39e9b1c8fec
sqrt(1.e6ae9218331623b1a8b2ceea4b22) = 1.60f95131df63442f9ba389039d55 ! = 1.60f95131df63442f9ba389039d54
sqrt(1.4dff0457856b9f2097275f9decbe) = 1.2468b38405610910a60929f0c895 ! = 1.2468b38405610910a60929f0c896
sqrt(3.e720f9998891f40b8a3d2207eb6c) = 1.f9be75953c96a035da06ccdc1e67 ! = 1.f9be75953c96a035da06ccdc1e66
sqrt(1.98d4defa4bf711b662d1a2bca893) = 1.43836938d7cb1c9cadaa6d69f11f ! = 1.43836938d7cb1c9cadaa6d69f11e
sqrt(3.f2e46bf92ba492c9980ddfa4317e) = 1.fcb6674f25a5dc44b4b702a34d13 ! = 1.fcb6674f25a5dc44b4b702a34d14
sqrt(1.24e5a1865c0cbb6483d9ccd2ced0) = 1.11d3e6bd82323006686745d16fa1 ! = 1.11d3e6bd82323006686745d16fa0
sqrt(3.937ee06d6e7d373cf1d95f09e500) = 1.e41d51bcc6c6a8867a5819bb1a1b ! = 1.e41d51bcc6c6a8867a5819bb1a1c
sqrt(2.83faa7963fffa7fbc9e246e0914e) = 1.96072460dd2c31680ca207682e4f ! = 1.96072460dd2c31680ca207682e50
sqrt(1.fdf4c95075ffdccec60afb6a74fb) = 1.6950bb2977940a3c7b6524ac62a5 ! = 1.6950bb2977940a3c7b6524ac62a4
Trying

#include <math.h>
#include <stdio.h>

int main ()
{
  volatile long double ld = 0x2.4e5658b5f6d2a1ec21e3829dd7f8p+0L;
  ld = sqrtl (ld);
  printf ("%.50La\n", ld);
  return 0;
}

on s390x-linux shows:
0x1.84bfedcba255aeaf7e99184e6f3b0000000000000000000000p+0
but that is implemented using a hw instruction.

#include <quadmath.h>
#include <stdio.h>

int main ()
{
  volatile __float128 ld = 0x2.4e5658b5f6d2a1ec21e3829dd7f8p+0Q;
  ld = sqrtq (ld);
  char buf[70];
  quadmath_snprintf (buf, 69, "%.50Qa", ld);
  printf ("%s\n", buf);
  return 0;
}

on x86_64 indeed prints ...6f3a.

The first testcase also prints ...6f3b on powerpc64le-linux, but again that doesn't say much, because the powerpc64le-linux __ieee754_sqrtf128 comes from sysdeps/powerpc/powerpc64/le/fpu/e_sqrtf128.c, where in my case it is implemented using soft-fp (and, when glibc is configured for power9 or later, using a hw instruction).

It seems libquadmath sqrtq.c isn't based on any glibc code; instead it uses sqrt or sqrtl for the initial approximation and then

  /* Two Newton iterations. */
  y -= 0.5q * (y - x / y);
  y -= 0.5q * (y - x / y);

(or, on the sqrtl path, one iteration).  I bet that doesn't and can't guarantee correct rounding.
So, do we need to switch to a soft-fp based implementation for it (we have a copy already in libgcc/soft-fp), or do we have other options?
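One other possible direction, beyond switching to soft-fp, is to keep a fast approximation and add a residual-based final fix-up.  The sketch below is only an illustration of that idea and is not code from libquadmath: sqrt_fixup is a made-up helper name, and it assumes that fmaq itself is correctly rounded and that the incoming y is already within 1 ulp of the true root (ties and results near the bottom of the exponent range would still need a separate, careful argument).

#include <quadmath.h>

/* Given y = approximation of sqrtq(x) accurate to < 1 ulp, return whichever
   of y and its neighbour in the direction of the true root squares closer
   to x.  The residuals y*y - x are formed with fmaq, so each is the exact
   difference rounded only once.  Sketch only; not a complete
   correctly-rounded implementation.  */
static __float128
sqrt_fixup (__float128 x, __float128 y)
{
  __float128 r  = fmaq (y, y, -x);                       /* y*y - x            */
  __float128 y2 = nextafterq (y, r > 0 ? 0 : HUGE_VALQ); /* step toward root   */
  __float128 r2 = fmaq (y2, y2, -x);
  return fabsq (r2) < fabsq (r) ? y2 : y;
}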
(In reply to Jakub Jelinek from comment #1)
> So, do we need to switch to soft-fp based implementation for it (we have a
> copy already in libgcc/soft-fp), or do we have other options?

https://cgit.freebsd.org/src/tree/lib/msun/src/e_sqrtl.c

The above worked on FreeBSD-sparc64.  Support for sparc64 has since been dropped by FreeBSD.  It is used on at least FreeBSD-aarch64, but I've never tested it there.
(In reply to kargl from comment #2)
> (In reply to Jakub Jelinek from comment #1)
>
> > So, do we need to switch to soft-fp based implementation for it (we have a
> > copy already in libgcc/soft-fp), or do we have other options?
>
> https://cgit.freebsd.org/src/tree/lib/msun/src/e_sqrtl.c
>
> The above worked on FreeBSD-sparc64.  Support for
> sparc64 has been dropped by FreeBSD.  It is used on
> at least FreeBSD-aarch64, but I've never tested it
> there.

Forgot to mention: see the long comment at the end of https://cgit.freebsd.org/src/tree/lib/msun/src/e_sqrt.c
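For readers who do not want to chase the link: that e_sqrt.c comment describes, among other things, the classic digit-by-digit (bit-by-bit) integer method, whose final remainder also tells you exactly how to round.  The fragment below only illustrates that idea on a 64-bit integer; isqrt64 is a made-up name, not the FreeBSD code.

#include <stdint.h>

/* Restoring, bit-by-bit integer square root: returns floor(sqrt(x)).
   Each iteration decides one result bit by comparing the running
   remainder against 4*q + 1; the remainder left at the end says whether
   the result is exact, which is the information a correctly rounded
   binary128 sqrt needs for its round/sticky bits.  */
static uint64_t
isqrt64 (uint64_t x)
{
  uint64_t q = 0;                       /* root bits accumulated so far */
  uint64_t r = 0;                       /* running remainder            */
  for (int i = 62; i >= 0; i -= 2)
    {
      r = (r << 2) | ((x >> i) & 3);    /* bring down the next two bits */
      uint64_t t = (q << 2) | 1;        /* trial subtrahend 4*q + 1     */
      q <<= 1;
      if (r >= t)
        {
          r -= t;
          q |= 1;
        }
    }
  return q;
}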
If you want a quick fix for immediate shipment, then you can take this:

#include <math.h>
#include <quadmath.h>

__float128 quick_and_dirty_sqrtq(__float128 x)
{
  if (isnanq(x))
    return x;

  if (x==0)
    return x;

  if (x < 0)
    return nanq("");

  if (isinfq(x))
    return x;

  int xExp;
  x = frexpq(x, &xExp);
  if (xExp & 1)
    x *= 2.0;                                         // x in [0.5:2.0)

  __float128 r = (__float128)(1.0/sqrt((double)x));   // r=rsqrt(x) estimate (53 bits)
  r *= 1.5 - r*r*x*0.5;  // NR iteration improves precision of r to 105.4 bit

  __float128 y = x*r;    // y=sqrt(x) estimate (105.4 bits)

  // extended-precision NR iteration
  __float128 yH = (double)y;
  __float128 yL = y - yH;

  __float128 deltaX = x - yH*yH;
  deltaX -= yH*yL*2;
  deltaX -= yL*yL;
  y += deltaX*r*0.5;     // improve precision of y to ~210.2 bits. Not enough for perfect rounding, but not too bad

  return ldexpq(y, xExp >> 1);
}

It is very slow, even slower than what you have now, which by itself is quite astonishingly slow.
It is also not sufficiently precise for correct rounding in all cases.
But, at least, the worst error is something like (0.5+2**-98) ULP, so you are unlikely to ever be caught by black-box testing.
Its biggest advantage is extreme portability.  It should run on all platforms where double == IEEE binary64 and __float128 == IEEE binary128.

Maybe a few days later I'll have a better variant for "good" 64-bit platforms, i.e. for those where we have __int128.
It would be 15-25 times faster than the variant above, and the rounding would be mathematically correct rather than just "impossible to be caught" like above.  But it would not run everywhere.
Also, I want to give it away under an MIT or BSD license, rather than under GPL.
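A quick spot check of the routine above against the first failing input from comment #0 can be done with quadmath_snprintf.  This assumes the quick_and_dirty_sqrtq definition above is in scope; the correctly rounded answer, per MPFR in comment #0, ends in ...6f3b:

#include <stdio.h>
#include <quadmath.h>

__float128 quick_and_dirty_sqrtq (__float128 x);

int main (void)
{
  volatile __float128 x = 0x2.4e5658b5f6d2a1ec21e3829dd7f8p+0Q;
  char buf[70];
  quadmath_snprintf (buf, sizeof buf, "%.28Qa", quick_and_dirty_sqrtq (x));
  printf ("%s\n", buf);  /* correctly rounded result is 0x1.84bfedcba255aeaf7e99184e6f3bp+0 */
  return 0;
}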
There is another, much worse, problem, reported and analyzed by "Michael S" on comp.arch.  The code has

#ifdef HAVE_SQRTL
  {
    long double xl = (long double) x;
    if (xl <= LDBL_MAX && xl >= LDBL_MIN)
      {
        /* Use long double result as starting point.  */
        y = (__float128) sqrtl (xl);

        /* One Newton iteration.  */
        y -= 0.5q * (y - x / y);
        return y;
      }
  }
#endif

which assumes that long double has a higher precision than normal double.  On x86_64, this depends on the settings of the FPU flags, so a number like 0x1.06bc82f7b9d71dfcbddf2358a0eap-1024 is computed with 32 ULP of error because there is only a single round of Newton iterations if the FPU flags are set to normal precision.

I believe we can at least fix that before the gcc 12 release, by simply removing the code I quoted.
(In reply to Thomas Koenig from comment #5)
> There is another, much worse, problem, reported and analyzed by "Michael S"
> on comp.arch.  The code has
>
> #ifdef HAVE_SQRTL
>   {
>     long double xl = (long double) x;
>     if (xl <= LDBL_MAX && xl >= LDBL_MIN)
>       {
>         /* Use long double result as starting point.  */
>         y = (__float128) sqrtl (xl);
>
>         /* One Newton iteration.  */
>         y -= 0.5q * (y - x / y);
>         return y;
>       }
>   }
> #endif
>
> which assumes that long double has a higher precision than
> normal double.  On x86_64, this depends on the settings of the
> FPU flags, so a number like 0x1.06bc82f7b9d71dfcbddf2358a0eap-1024
> is computed with 32 ULP of error because there is only a single
> round of Newton iterations if the FPU flags are set to normal precision.

That is only a problem on OSes that do that, I think mainly BSDs, no?
On Linux it should be fine (well, still not 0.5 ulp precise, but not as bad as when sqrtl is just double-precision precise).
(In reply to Jakub Jelinek from comment #6)
> (In reply to Thomas Koenig from comment #5)
> > There is another, much worse, problem, reported and analyzed by "Michael S"
> > on comp.arch.  The code has
> >
> > #ifdef HAVE_SQRTL
> >   {
> >     long double xl = (long double) x;
> >     if (xl <= LDBL_MAX && xl >= LDBL_MIN)
> >       {
> >         /* Use long double result as starting point.  */
> >         y = (__float128) sqrtl (xl);
> >
> >         /* One Newton iteration.  */
> >         y -= 0.5q * (y - x / y);
> >         return y;
> >       }
> >   }
> > #endif
> >
> > which assumes that long double has a higher precision than
> > normal double.  On x86_64, this depends on the settings of the
> > FPU flags, so a number like 0x1.06bc82f7b9d71dfcbddf2358a0eap-1024
> > is computed with 32 ULP of error because there is only a single
> > round of Newton iterations if the FPU flags are set to normal precision.
>
> That is only a problem on OSes that do that, I think mainly BSDs, no?

Correct.

> On Linux it should be fine (well, still not 0.5 ulp precise, but not as bad
> as when sqrtl is just double-precision precise).

In this case, it was discovered on some version of WSL.
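Whether a given x86 system is in the problematic configuration can be checked directly.  The sketch below is glibc- and x86-specific (it uses <fpu_control.h>, which is not mentioned in the thread) and merely reports the x87 precision-control field that the HAVE_SQRTL path implicitly relies on:

#include <stdio.h>
#include <fpu_control.h>

int main (void)
{
  fpu_control_t cw;
  _FPU_GETCW (cw);
  if ((cw & _FPU_EXTENDED) == _FPU_EXTENDED)
    puts ("x87 precision control: extended (64-bit significand)");
  else if ((cw & _FPU_EXTENDED) == _FPU_DOUBLE)
    puts ("x87 precision control: double (53-bit significand)");
  else
    puts ("x87 precision control: single (24-bit significand)");
  return 0;
}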
On Sat, Apr 09, 2022 at 10:23:39AM +0000, jakub at gcc dot gnu.org wrote:
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105101
>
> --- Comment #6 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
> (In reply to Thomas Koenig from comment #5)
> > There is another, much worse, problem, reported and analyzed by "Michael S"
> > on comp.arch.  The code has
> >
> > #ifdef HAVE_SQRTL
> >   {
> >     long double xl = (long double) x;
> >     if (xl <= LDBL_MAX && xl >= LDBL_MIN)
> >       {
> >         /* Use long double result as starting point.  */
> >         y = (__float128) sqrtl (xl);
> >
> >         /* One Newton iteration.  */
> >         y -= 0.5q * (y - x / y);
> >         return y;
> >       }
> >   }
> > #endif
> >
> > which assumes that long double has a higher precision than
> > normal double.  On x86_64, this depends on the settings of the
> > FPU flags, so a number like 0x1.06bc82f7b9d71dfcbddf2358a0eap-1024
> > is computed with 32 ULP of error because there is only a single
> > round of Newton iterations if the FPU flags are set to normal precision.
>
> That is only a problem on OSes that do that, I think mainly BSDs, no?
> On Linux it should be fine (well, still not 0.5 ulp precise, but not as bad as
> when sqrtl is just double-precision precise).

i686-*-freebsd sets the FPU to have 53 bits of precision for long double.
It has the usual exponent range of an Intel 80-bit extended double.
(In reply to Michael_S from comment #4) > If you want quick fix for immediate shipment then you can take that: > > #include <math.h> > #include <quadmath.h> > > __float128 quick_and_dirty_sqrtq(__float128 x) > { > if (isnanq(x)) > return x; > > if (x==0) > return x; > > if (x < 0) > return nanq(""); > > if (isinfq(x)) > return x; > > int xExp; > x = frexpq(x, &xExp); > if (xExp & 1) > x *= 2.0; // x in [0.5:2.0) > > __float128 r = (__float128)(1.0/sqrt((double)x)); // r=rsqrt(x) estimate > (53 bits) > r *= 1.5 - r*r*x*0.5; // NR iteration improves precision of r to 105.4 bit > > __float128 y = x*r; // y=sqrt(x) estimate (105.4 bits) > > // extended-precision NR iteration > __float128 yH = (double)y; > __float128 yL = y - yH; > > __float128 deltaX = x - yH*yH; > deltaX -= yH*yL*2; > deltaX -= yL*yL; > y += deltaX*r*0.5; // improve precision of y to ~210.2 bits. Not enough > for perfect rounding, but not too bad > > return ldexpq(y, xExp >> 1); > } > > It is very slow, even slower than what you have now, which by itself is > quite astonishingly slow. > It is also not sufficiently precise for correct rounding in all cases. > But, at least, the worst error is something like (0.5+2**-98) ULP, so you are > unlikely to be ever caught by black box type of testing. > It's biggest advantage is extreme portability. > Should run on all platforms where double==IEEE binary64 and __float128 == > IEEE binary128. > > May be, few days later I'll have better variant for "good" 64-bit platforms > i.e. for those where we have __int128. > It would be 15-25 times faster than the variant above and rounding would be > mathematically correct rather than just "impossible to be caught" like above. > But it would not run everywhere. > Also, I want to give it away under MIT or BSD license, rather than under GPL. Here. 
#include <stdint.h> #include <string.h> #include <quadmath.h> static __inline uint64_t umulx64(uint64_t mult1, uint64_t mult2, int rshift) { return (uint64_t)(((unsigned __int128)mult1 * mult2)>>rshift); } static __inline uint64_t umulh64(uint64_t mult1, uint64_t mult2) { return umulx64(mult1, mult2, 64); } static __inline __float128 ret__float128(uint64_t wordH, uint64_t wordL) { unsigned __int128 res_u128 = ((unsigned __int128)wordH << 64) | wordL; __float128 result; memcpy(&result, &res_u128, sizeof(result)); // hopefully __int128 and __float128 have the endianness return result; } __float128 slow_and_clean_sqrtq(__float128 src) { typedef unsigned __int128 __uint128; const uint64_t SIGN_BIT = (uint64_t)1 << 63; const uint64_t MANT_H_MSK = (uint64_t)-1 >> 16; const uint64_t nNAN_MSW = (uint64_t)0xFFFF8 << 44; // -NaN __uint128 src_u128; memcpy(&src_u128, &src, sizeof(src_u128)); uint64_t mshalf = (uint64_t)(src_u128 >> 64); // MS half uint64_t mantL = (uint64_t)(src_u128); // LS half // parse MS half of source unsigned exp = mshalf >> 48; uint64_t mantH = mshalf & MANT_H_MSK; unsigned expx = exp + 0x3FFF; if (exp - 1 >= 0x7FFF-1) { // special cases: exp = 0, exp=max_exp or sign=1 if (exp > 0x7fff) { // negative if (exp==0xFFFF) { // NaN or -Inf if ((mantH | mantL)==0) // -Inf mshalf = nNAN_MSW; return ret__float128(mshalf, mantL); // in case of NaN the leave number intact } if (mshalf==SIGN_BIT && mantL==0) // -0 return ret__float128(mshalf, mantL); // in case of -0 leave the number intact // normal or sub-normal mantL = 0; mshalf = nNAN_MSW; return ret__float128(mshalf, mantL); } // non-negative if (exp == 0x7fff) // NaN or -Inf return ret__float128(mshalf, mantL); // in case of positive NaN or Inf leave the number intact // exp==0 : zero or subnormal if ((mantH | mantL)==0) // +0 return ret__float128(mshalf, mantL); // in case of +0 leave the number intact // positive subnormal // normalize if (mantH == 0) { expx -= 48; int lz = __builtin_clzll(mantL); expx -= lz; mantL <<= lz + 1; mantH = mantL >> 16; mantL = mantL << 48; } else { int lz = __builtin_clzll(mantH)-16; expx -= lz; mantH = (mantH << (lz+1)) | mantL >> (63-lz); mantL = mantL << (lz + 1); } mantH &= MANT_H_MSK; } // Normal case int e = expx & 1; // e= LS bit of exponent const uint64_t BIT_30 = (uint64_t)1 << 30; static const uint8_t rsqrt_tab[64] = { // floor(1/sqrt(1+i/32)*512) - 256 252, 248, 244, 240, 237, 233, 230, 226, 223, 220, 216, 213, 210, 207, 204, 201, 199, 196, 193, 190, 188, 185, 183, 180, 178, 175, 173, 171, 168, 166, 164, 162, 159, 157, 155, 153, 151, 149, 147, 145, 143, 141, 139, 138, 136, 134, 132, 131, 129, 127, 125, 124, 122, 121, 119, 117, 116, 114, 113, 111, 110, 108, 107, 106, }; // Look for approximate value of rsqrt(x)=1/sqrt(x) // where x = 1:mantH:mantL unsigned r0 = rsqrt_tab[mantH >> (48-6)]+256; // scale = 2**9 // r0 is low estimate, r0*sqrt(x) is in range [0.99119507..0.99991213] // Est. Precisions: ~6.82 bits // // Improve precision of the estimate by 2nd-order NR iteration with custom polynomial // r1 = r0 * (P2*(r0*r0*x)**2 + P1*(r0*r0*x) + P0) // where // P2 = 0.38345164828933775 // P1 = -1.26684607889193623 // P0 = 1.88339443966540210 // Calculations are implemented in a way that minimizes latency // and minimizes requirement to precision of the coefficients as // ((rrx0_n + A)**2 + B)*(r0*F[e]) // where rrx0_n = 1 - r0*r0*x // // The last multiplications serves additional purpose of adjustment by exponent (multiplication by sqrt(0.5)) // Only upper 30 bits of mantissa used in evaluation. 
Use of minimal amount of input bits // simplifies validation by exhausting with minimal negative effect on worst-case precision // uint32_t rrx0 = ((r0*r0 * ((mantH >> 18)+BIT_30+1))>>16)+1;// scale = 2**32 // rrx0 calculated with x1_u=x/2**34 + 1 instead of x1=x/2**34, in order to assure that // for any possible values of 34 LS bits of x, the estimate r1 is lower than exact value of r const uint32_t A = 2799880908; // scale = 2**32 const uint32_t B = 2343892134; // scale = 2**30 static uint32_t const f_tab[2] = { 3293824581, 2329085697 }; // F/sqrt(2**i), scale = 2**33 uint32_t rrx0_n = 0 - rrx0; // scale = 2**32, nbits=27 uint32_t termA = rrx0_n + A; // scale = 2**32, nbits=32 uint64_t termAA = ((uint64_t)termA * termA) >> 34; // scale = 2**30, nbits=30 uint64_t termAB = termAA + B; // scale = 2**30, nbits=32 uint64_t termRF = (r0*(uint64_t)f_tab[e]) >> 9; // scale = 2**33, nbits=32 uint64_t r1 = (termAB*termRF)>>33; // scale = 2**30; nbits=30 // r1 is low estimate // r1 is in range [0x1ffffffe .. 0x3fffffe1] // r1*sqrt(x) is in range [0.999_999_891_641_538_03..0.999_999_999_782_706_57], i.e. // Est. Precisions: ~23.13 bits // // Further improve precision of the estimate by canonical 2nd-order NR iteration // r2 = r1 * (P2*(r0*r0*x)**2 + P1*(r0*r0*x) + P0) // where // P2 = 3/8 // P1 = -5/4 // P0 = 15/8 // At this point precision is of high importance, so evaluation is done in a way // that minimizes impact of truncation while still doing majority of the work with // efficient 64-bit arithmetic. // The transformed equation is r2 = r2 + r2*rrx1_n*(rrx1_n*3 + 4)/8 // where rrx1_n = 1 - r1*r1*x // uint64_t mant64 = (mantH << 16) | (mantL >> 48); // Upper 64 bits of mantissa uint64_t r1r1 = (r1*r1) << e; // scale = 2**60; nbits=60 __uint128 rrx1_x = (__uint128)r1r1*mant64 + r1r1; // r1*r1*(x+1), scale 2**124 uint64_t rrx1 = (uint64_t)(rrx1_x>>53)+(r1r1<<11)+1; // r1*r1*x , scale = 2**71; nbits=64 // rrx1 calculated with x2_u=x+1 instead of x2=x, in order to assure that // for any possible values of 48 LS bits of x, the estimates r2 and y2 are lower than exact values of r and y uint64_t rrx1_n = 0 - rrx1; // rrx1_n=1-r1^2*x , scale = 2**71, nbits=49 uint64_t term1 = rrx1_n; // rrx1_n*(1/2) , scale = 2**72, nbits=49 uint64_t term2 = umulh64(rrx1_n*3,rrx1_n) >> 9; // rrx1_n*rrx1_n*(3/8), scale = 2**72, nbits=27 uint64_t deltaR = umulh64(r1<<26,term1+term2); // (rrx1_n*1/2+rrx1_n*rrx1_n*(3/8))*r1, scale = 2**64, nbits=41 uint64_t r2 = (r1 << 34)+deltaR; // scale = 2**64, nbits=64 // r2 is low estimate, // r2 is in range [0x8000000000000000..0xffffffffffffffff] // r2*sqrt(x) is in range [0.999_999_999_999_999_999_888_676 .. 0.999_999_999_999_999_999_999_999_989] // Est. 
Precisions: ~62.96 bits // Worst-case errors are caused by unlucky truncation of cases where exact result scaled by 2**64 is in form N+eps // Calculate y = estimate of sqrt(1+x)-1 = x*rsqrt(x)+rsqrt(x) uint64_t y2 = (umulh64(mant64, r2) + r2) << e; // scale = 2**64, nbits=64 // scale = 2**64, nbits=64 if (r2 >= (uint64_t)(-2)) y2 = 0; // // Improve precision of y by 1-st order NR iteration // It is a canonical NR iteration y = (x + y*y)/(2*y) used // in a form y = y + (x - y*y)*r/2 // // Calculate bits[-60:-123] of dX = (x+1)*2**e - (y+1)*(y+1) // All upper bits of difference are known to be 0 __uint128 y2y2 = (__uint128)y2 * y2; // y*y, scale 2**128 uint64_t yy2 = (uint64_t)(y2y2 >> 5)+(y2<<60);// y*y+2*y, scale 2**123 uint64_t dx = ((mantL<<e)<<11) - yy2; // scale 2**123 dx -= (dx != 0); // subtract 1 in order to guarantee that estimate is lower than exact value uint64_t deltaY = umulh64(dx, r2); // (x - y*y)*r/2, scale 2**124 // Round deltaY to scale = 2**112 const uint64_t GUARD_MSK = (1 << 12)-1; // scale 2**124 const uint64_t GUARD_BIT = 1 << 11; // scale 2**124 const uint64_t MAX_ERR = 6; // scale 2**124 uint64_t round_incr = GUARD_BIT; uint64_t guard_bits = deltaY & GUARD_MSK; if (guard_bits - (GUARD_BIT-MAX_ERR) < MAX_ERR) { // Guard bits are very close to the half-bit (from below) // Direction of the rounding should be found by additional calculations __uint128 midY = (((__uint128)y2 << 49)+(deltaY>>11)) | 1 | ((__uint128)1 << 113); // scale 2**113 // MidY resides between two representable output points __uint128 rndDir = midY*midY; // scale 2**226 uint64_t rndDirH = (uint64_t)(rndDir >> 64); // scale 2**162 if ((rndDirH >> (162-113+e)) & 1) round_incr = GUARD_MSK; // when guard bit of the square = 1, it means that the square of midY < x, so we need to round up } deltaY = (deltaY + round_incr) >> 12; // scale 2**112 __uint128 y = ((__uint128)y2 << 48) + deltaY; // scale 2**112 // format output mantL = (uint64_t)(y); mantH = (uint64_t)(y >> 64) & MANT_H_MSK; uint64_t resExp = expx / 2; mshalf = (resExp << 48) | mantH; return ret__float128(mshalf, mantL); } Hopefully, this one is perfectly precise. On Intel (IvyBridge, Haswell, Skylake) it is quite fast. On AMD (Zen3) - less so. I didn't figure out yet, what causes a slowdown. But even on AMD, it's slow only relatively to the same code on Intel. Relatively to current library it is 7.4 times faster. Also, I hope that this post could be considered as giving a code away to any takers with BSD-like license. If it is not, then tell me, how to do it right.
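A minimal way to exercise the routine above is to plug it into the MPFR harness from comment #0.  Everything here mirrors that harness (MPFR built with binary128 support, inputs restricted to [1,4), round-to-nearest only), so subnormal inputs and the exhaustive corner cases mentioned in the comments still need separate checks:

#include <stdio.h>
#include <gmp.h>
#define MPFR_WANT_FLOAT128
#include <mpfr.h>
#include <quadmath.h>

__float128 slow_and_clean_sqrtq (__float128 src);   /* the code above */

int main (void)
{
  mpfr_t a, b, tst;
  gmp_randstate_t state;
  gmp_randinit_mt (state);
  mpfr_set_default_prec (113);
  mpfr_inits (a, b, tst, (mpfr_ptr) 0);
  long bad = 0;
  for (long i = 0; i < 10000000; i++)
    {
      mpfr_urandom (a, state, MPFR_RNDN);
      mpfr_mul_ui (a, a, 3, MPFR_RNDN);
      mpfr_add_ui (a, a, 1, MPFR_RNDN);
      __float128 af = mpfr_get_float128 (a, MPFR_RNDN);
      mpfr_sqrt (b, a, MPFR_RNDN);
      mpfr_set_float128 (tst, slow_and_clean_sqrtq (af), MPFR_RNDN);
      bad += !mpfr_equal_p (tst, b);
    }
  printf ("%ld mismatches\n", bad);
  return bad != 0;
}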
BTW, the same ideas as in the code above could improve the speed of the division operation (on modern 64-bit HW) by a factor of 3 (on Intel) or 2 (on AMD).
(In reply to Michael_S from comment #10)
> BTW, the same ideas as in the code above could improve the speed of the
> division operation (on modern 64-bit HW) by a factor of 3 (on Intel) or 2
> (on AMD).

Did it.
On Intel it's even better than anticipated: 5x speedup on Haswell and Skylake, 4.5x on Ivy Bridge.
Unfortunately, right now I have no connection to my Zen3 test system, so I can't measure the final variant on it.  But according to preliminary tests the speedup should be slightly better than 2x.
(In reply to Michael_S from comment #11) > (In reply to Michael_S from comment #10) > > BTW, the same ideas as in the code above could improve speed of division > > operation (on modern 64-bit HW) by factor of 3 (on Intel) or 2 (on AMD). > > Did it. > On Intel it's even better than anticipated. 5x speedup on Haswell and > Skylake, > 4.5x on Ivy Bridge. > Unfortunately, right now I have no connection to my Zen3 test system, so > can't > measure on it with final variant. But according to preliminary tests the > speedup should be slightly better than 2x. O.k. It seems now I fixed all details of rounding. Including cases of sub-normal results. Even ties now appear to be broken to even, as prescribed. It can take a bit more polish, esp. a comments, but even in this form much better than what you have now. #include <stdint.h> #include <string.h> #include <quadmath.h> static __inline uint64_t umulx64(uint64_t mult1, uint64_t mult2, int rshift) { return (uint64_t)(((unsigned __int128)mult1 * mult2)>>rshift); } static __inline uint64_t umulh64(uint64_t mult1, uint64_t mult2) { return umulx64(mult1, mult2, 64); } static __inline void set__float128(__float128* dst, uint64_t wordH, uint64_t wordL) { unsigned __int128 res_u128 = ((unsigned __int128)wordH << 64) | wordL; memcpy(dst, &res_u128, sizeof(*dst)); // hopefully __int128 and __float128 have the endianness } // return number of leading zeros static __inline int normalize_float128(uint64_t* pMantH, uint64_t* pMantL) { const uint64_t MANT_H_MSK = (uint64_t)-1 >> 16; uint64_t mantH = *pMantH, mantL = *pMantL; int lz; if (mantH == 0) { lz = __builtin_clzll(mantL); mantL <<= lz + 1; // shift out all leading zeroes and first leading 1 mantH = mantL >> 16; mantL = mantL << 48; lz += 48; } else { lz = __builtin_clzll(mantH)-16; mantH = (mantH << (lz+1)) | mantL >> (63-lz); mantL = mantL << (lz + 1); } mantH &= MANT_H_MSK; *pMantH = mantH; *pMantL = mantL; return lz; } void my__divtf3(__float128* result, const __float128* num, const __float128* den) { typedef unsigned __int128 __uint128; __uint128 u_num, u_den; memcpy(&u_num, num, sizeof(u_num)); memcpy(&u_den, den, sizeof(u_den)); const uint64_t BIT_30 = (uint64_t)1 << 30; const uint64_t BIT_48 = (uint64_t)1 << 48; const uint64_t BIT_63 = (uint64_t)1 << 63; const uint64_t SIGN_BIT = BIT_63; const uint64_t SIGN_OFF_MSK = ~SIGN_BIT; const uint64_t MANT_H_MSK = (uint64_t)-1 >> 16; const uint64_t EXP_MSK = (uint64_t)0x7FFF << 48; const uint64_t pNAN_MSW = (uint64_t)0x7FFF8 << 44; // +NaN const uint64_t pINF_MSW = EXP_MSK; // +Inf // parse num and handle special cases uint64_t numHi = (uint64_t)(u_num >> 64); uint64_t numLo = (uint64_t)u_num; uint64_t denHi = (uint64_t)(u_den >> 64); uint64_t denLo = (uint64_t)u_den; unsigned numSexp = numHi >> 48; unsigned numExp = numSexp & 0x7FFF; int inumExp = numExp; uint64_t numMantHi = numHi & MANT_H_MSK; uint64_t resSign = (numHi ^ denHi) & SIGN_BIT; if (numExp - 1 >= 0x7FFF - 1) { // special cases if (numExp == 0x7FFF) { // inf or Nan if ((numMantHi | numLo)==0) { // inf numHi = resSign | pINF_MSW; // Inf with proper sign if ((denLo == 0) && ((denHi & SIGN_OFF_MSK)==pINF_MSW)) { // den is inf numHi = pNAN_MSW; // Inf/Inf => NaN } } set__float128(result, numHi, numLo); return; } // numExp == 0 -> zero or sub-normal if (((numHi & SIGN_OFF_MSK) | numLo)==0) { // zero if ((denHi & EXP_MSK)==pINF_MSW) { // den = inf or Nan if ((denLo == 0) && ((denHi & SIGN_OFF_MSK)==pINF_MSW)) { // den is inf numHi = resSign; // return zero with proper sign } else { // den is Nan // 
return den numLo = denLo; numHi = denHi; } } if (((denHi & SIGN_OFF_MSK) | denLo)==0) { // den is zero numHi = pNAN_MSW; // 0/0 => NaN } set__float128(result, numHi, numLo); return; } // num is sub-normal: normalize inumExp -= normalize_float128(&numMantHi, &numLo); } // // num is normal or normalized // // parse den and handle special cases unsigned denSexp = denHi >> 48; unsigned denExp = denSexp & 0x7FFF; int idenExp = denExp; uint64_t denMantHi = denHi & MANT_H_MSK; if (denExp - 1 >= 0x7FFF - 1) { // special cases if (denExp == 0x7FFF) { // inf or Nan if ((denMantHi | denLo)==0) { // den is inf // return zero with proper sign denHi = resSign; denLo = 0; } // if den==Nan then return den set__float128(result, denHi, denLo); return; } // denExp == 0 -> zero or sub-normal if (((denHi & SIGN_OFF_MSK) | denLo)==0) { // den is zero // return Inf with proper sign numHi = resSign | pINF_MSW; numLo = 0; set__float128(result, numHi, numLo); return; } // den is sub-normal: normalize idenExp -= normalize_float128(&denMantHi, &denLo); } // // both num and den are normal or normalized // // calculate exponent of result int expDecr = (denMantHi > numMantHi) | ((denMantHi == numMantHi) & (denLo > numLo)); // mant(den) >= mant(num) int iResExp = inumExp - idenExp - expDecr + 0x3FFF; if (iResExp >= 0x7FFF) { // Overflow // return Inf with proper sign set__float128(result, resSign | pINF_MSW, 0); return; } if (iResExp < -112) { // underflow // return Zero with proper sign set__float128(result, resSign, 0); return; } // calculate 64-bit approximation from below for 1/mantissa of den static const uint8_t recip_tab[64] = { // floor(1/(1+i/32)*512) - 256 248, 240, 233, 225, 218, 212, 205, 199, 192, 186, 180, 175, 169, 164, 158, 153, 148, 143, 138, 134, 129, 125, 120, 116, 112, 108, 104, 100, 96 , 92 , 88 , 85 , 81 , 78 , 74 , 71 , 68 , 65 , 62 , 59 , 56 , 53 , 50 , 47 , 44 , 41 , 39 , 36 , 33 , 31 , 28 , 26 , 24 , 21 , 19 , 17 , 14 , 12 , 10 , 8 , 6 , 4 , 2 , 0 , }; // Look for approximate value of recip(x)=1/x // where x = 1:denMantHi:denMantLo unsigned r0 = recip_tab[denMantHi >> (48-6)]+256; // scale = 2**9 // r0 is low estimate, r0*x is in range [0.98348999..1.0] // Est. Precisions: ~5.92 bits // Improve precision of r by two NR iterations uint64_t denMant30 = (denMantHi >> 18)+BIT_30+1; // scale = 2**30, up to 32 bits // Only 30 upper bits of mantissa used in evaluation. Use of minimal amount of input bits // simplifies validation by exhausting with minimal negative effect on worst-case precision // LSB of denMant30 is incremented by 1 in order to assure that // for any possible values of LS bits of denMantHi:Lo // the estimate r1 is lower than exact value of r const uint64_t TWO_SCALE60 = ((uint64_t)2 << 60) - 1; // scale = 2**60, 61 bit uint64_t r1 = (uint64_t)r0 << 21; // scale = 2**30, 30 bits r1 = (((TWO_SCALE60 - r1 * denMant30) >> 29)*r1) >> 31; r1 = (((TWO_SCALE60 - r1 * denMant30) >> 29)*r1) << 3;// scale = 2**64 // r1 is low estimate // r1 is in range [0x7ffffffeffffffe8..0xfffffefbffc00000] // r1*x is in range [0.9999999243512687..0.9999999999999999954] // Est. Precisions: ~23.65 bits // // Further improve precision of the estimate by canonical 2nd-order NR iteration // r2 = r1 * (P2*(r0*r0*x)**2 + P1*(r0*r0*x) + P0) // where // P2 = +1 // P1 = -3 // P0 = +3 // At this point precision is of high importance, so evaluation is done in a way // that minimizes impact of truncation while still doing majority of the work with // efficient 64-bit arithmetic. 
// The transformed equation is r2 = r1 + r1*rx1_n*(rx1_n + 1) // where rx1_n = 1 - r1*x // uint64_t denMant64 = (denMantHi << 16)|(denLo >> 48); // Upper 64 bits of mantissa, not including leading 1 uint64_t rx1 = umulh64(r1,denMant64) + r1 + 2; // r1*(2**64+x+1), scale = 2**64; nbits=64 // rx1 calculated with x2_u=x+1 instead of x2=x, in order to assure that // for any possible values of 48 LS bits of x, the estimate r2 is lower than exact values of r uint64_t rx1_n = 0 - rx1; // rx1_n=1-r1*x , scale = 2**64, nbits=42 uint64_t termRB2 = umulh64(rx1_n, r1); // rx1_n*r1 , scale = 2**64, nbits=42 uint64_t deltaR1 = umulh64(termRB2,rx1_n)+termRB2; // termRB2*(rx1_n+1) , scale = 2**64, nbits=43 uint64_t r2 = r1 + deltaR1; // scale = 2**64, nbits=64 // r2 is low estimate, // r2 is in range [0x7fffffffffffffff..0xfffffffffffffffe] // r2*x is in range [0.999_999_999_999_999_999_674_994 .. 0.999_999_999_999_999_999_999_726] // Est. Precisions: ~61.41 bits // max(r2*x-1) ~= 5.995 * 2**-64 uint64_t numMsw = ((numMantHi << 15) | (numLo >> (64-15))) + BIT_63; // scale = 2**63 uint64_t res1 = umulh64(numMsw, r2) << expDecr; // scale = 2**63 __uint128 denx = ((__uint128)(denMantHi | BIT_48) << 64) | denLo; // scale = 2**112 uint64_t prod1 = (uint64_t)((res1*(denx << 11)) >> 64)+1; // scale = 2**122 uint64_t num122Lo = numLo << (10+expDecr); // scale = 2**122 uint64_t deltaProd= num122Lo - prod1; // scale = 2**122 uint64_t deltaRes = umulh64(deltaProd, r2); // scale = 2**122 // Round deltaRes to scale = 2**112 const unsigned N_GUARD_BITS = 10; const unsigned MAX_ERR = 3; // scale 2**122 __uint128 res2x; if (iResExp > 0) { // normal result const unsigned GUARD_BIT = 1 << (N_GUARD_BITS-1); // scale 2**122 const unsigned GUARD_MSK = GUARD_BIT*2-1; // scale 2**122 uint64_t round_incr = GUARD_BIT; unsigned guard_bits = deltaRes & GUARD_MSK; if (guard_bits - (GUARD_BIT-MAX_ERR) < MAX_ERR) { // Guard bits are very close to the half-ULP (from below) // Direction of the rounding should be found by additional calculations __uint128 midRes = (((__uint128)res1 << 50)+(deltaRes >> 9)) | 1; // scale 2**113 // midRes resides between two representable output points __uint128 midProduct = midRes*denx; // scale 2**225 uint64_t midProductH = (uint64_t)(midProduct >> 64); // scale 2**161 if ((midProductH >> (161-113+expDecr)) & 1) round_incr = GUARD_MSK; // when guard bit of the product = 1, it means that the product of midRes < num, so we need to round up } deltaRes = (deltaRes + round_incr) >> 10; // scale = 2**112 res2x = ((__uint128)res1 << 49) + deltaRes; // scale = 2**112 } else { // sub-normal result int shift = 1 - iResExp; // range [1:113] const __uint128 GUARD_BIT = (__uint128)1 << (shift+N_GUARD_BITS-1); // scale = 2**122 const __uint128 GUARD_MSK = GUARD_BIT*2 - 1; // scale = 2**122 __uint128 resx = ((__uint128)res1 << 59) + deltaRes; // scale = 2**122 __uint128 guard_bits = resx & GUARD_MSK; resx >>= (shift+N_GUARD_BITS-2); // scale = 2**(114-shift) unsigned round_incr = 2; if (guard_bits - (GUARD_BIT-MAX_ERR) < MAX_ERR) { // Guard bits are very close to the half-ULP (from below) // Direction of the rounding should be found by additional calculations __uint128 midRes = resx ^ 3; // scale 2**(114-shift, 2 LS bits changed from '01' to '10' // midRes resides between two representable output points __uint128 midProduct = midRes*denx; // scale 2**(226-shift) midProduct -= ((unsigned)resx >> 2) & 1; // Decrement a product when LS Bit of non-rounded result=='1' // That's a dirty trick that breaks ties toward nearest 
even. // Only necessary in subnormal case. For normal results tie is impossible int guardPos = 226-113+expDecr-shift; uint64_t midProductGuardBit = (uint64_t)(midProduct >> guardPos) & 1; // When a guard bit of the product = 1, it means that the product of midRes*den < num // Which means that we have to round up round_incr += midProductGuardBit; } res2x = (resx + round_incr) >> 2; iResExp = 0; } numMantHi = (res2x >> 64) & MANT_H_MSK; numLo = res2x; numHi = ((uint64_t)(iResExp) << 48) | numMantHi | resSign; set__float128(result, numHi, numLo); }
It turned out that on all micro-architectures that I care about (and majority of those that I don't care) double precision floating point division is quite fast. It's so fast that it easily beats my clever reciprocal estimator. Which means that this simple code is faster than complicated code above. Not by much, but faster. And much simpler. #include <stdint.h> #include <string.h> #include <quadmath.h> static __inline uint64_t umulx64(uint64_t mult1, uint64_t mult2, int rshift) { return (uint64_t)(((unsigned __int128)mult1 * mult2)>>rshift); } static __inline uint64_t umulh64(uint64_t mult1, uint64_t mult2) { return umulx64(mult1, mult2, 64); } static __inline void set__float128(__float128* dst, uint64_t wordH, uint64_t wordL) { unsigned __int128 res_u128 = ((unsigned __int128)wordH << 64) | wordL; memcpy(dst, &res_u128, sizeof(*dst)); // hopefully __int128 and __float128 have the endianness } // return number of leading zeros static __inline int normalize_float128(uint64_t* pMantH, uint64_t* pMantL) { const uint64_t MANT_H_MSK = (uint64_t)-1 >> 16; uint64_t mantH = *pMantH, mantL = *pMantL; int lz; if (mantH == 0) { lz = __builtin_clzll(mantL); mantL <<= lz + 1; // shift out all leading zeroes and first leading 1 mantH = mantL >> 16; mantL = mantL << 48; lz += 48; } else { lz = __builtin_clzll(mantH)-16; mantH = (mantH << (lz+1)) | mantL >> (63-lz); mantL = mantL << (lz + 1); } mantH &= MANT_H_MSK; *pMantH = mantH; *pMantL = mantL; return lz; } static __inline uint64_t d2u(double x) { uint64_t y; memcpy(&y, &x, sizeof(y)); return y; } static __inline double u2d(uint64_t x) { double y; memcpy(&y, &x, sizeof(y)); return y; } // calc_reciprocal // calculate lower estimate for r = floor(2**64/(1 + mantHi*2**(-48) + mantLo*2**(-112))) static __inline uint64_t calc_reciprocal(uint64_t mantHi, uint64_t mantLo) { const uint64_t DBL_1PLUS = ((uint64_t)0x3FF << 52) + 18; uint64_t mant_u = (mantHi << 4) + DBL_1PLUS; uint64_t r1 = d2u(2.0/u2d(mant_u)) << 11; // scale = 2**64, nbits=64 // r1 is low estimate // r1 is in range [0x7fffffffffffe000..0xfffffffffffee000 ] // r1*x is in range [0.99999999999999595..0.999999999999999889] // Est. Precisions: ~47.81 bits // // Improve precision of the estimate by canonical 1st-order NR iteration // r = r1*(2-r1*x) // At this point precision is of high importance, so evaluation is done in a way // that minimizes impact of truncation. // The transformed equation is r = r1 + r1*(1-r1*x) // uint64_t mant64 = (mantHi << 16)|(mantLo >> 48); // Upper 64 bits of mantissa, not including leading 1 uint64_t rx1 = umulh64(r1,mant64) + r1 + 2; // r1*(2**64+x+1), scale = 2**64; nbits=64 // rx1 calculated with x2_u=x+1 instead of x2=x, in order to assure that // for any possible values of 48 LS bits of x, the estimate r2 is lower than exact values of r uint64_t r = r1 + umulh64(0-rx1, r1); // scale = 2**64, nbits=64 // r is low estimate // r is in range [0x7fffffffffffffff..0xfffffffffffffffd] // r*x is in range [0.999_999_999_999_999_999_783..0.999_999_999_999_999_999_999_957] // Est. 
Precisions: ~62.00069 bits // max(r*x-1) ~= 3.998079 * 2**-64 return r; } void my__divtf3(__float128* result, const __float128* num, const __float128* den) { typedef unsigned __int128 __uint128; __uint128 u_num, u_den; memcpy(&u_num, num, sizeof(u_num)); memcpy(&u_den, den, sizeof(u_den)); const uint64_t BIT_48 = (uint64_t)1 << 48; const uint64_t BIT_63 = (uint64_t)1 << 63; const uint64_t SIGN_BIT = BIT_63; const uint64_t SIGN_OFF_MSK = ~SIGN_BIT; const uint64_t MANT_H_MSK = (uint64_t)-1 >> 16; const uint64_t EXP_MSK = (uint64_t)0x7FFF << 48; const uint64_t pNAN_MSW = (uint64_t)0x7FFF8 << 44; // +NaN const uint64_t pINF_MSW = EXP_MSK; // +Inf // const uint64_t nNAN_MSW = (uint64_t)0xFFFF8 << 44; // -NaN // parse num and handle special cases uint64_t numHi = (uint64_t)(u_num >> 64); uint64_t numLo = (uint64_t)u_num; uint64_t denHi = (uint64_t)(u_den >> 64); uint64_t denLo = (uint64_t)u_den; unsigned numSexp = numHi >> 48; unsigned numExp = numSexp & 0x7FFF; int inumExp = numExp; uint64_t numMantHi = numHi & MANT_H_MSK; uint64_t resSign = (numHi ^ denHi) & SIGN_BIT; if (numExp - 1 >= 0x7FFF - 1) { // special cases if (numExp == 0x7FFF) { // inf or Nan if ((numMantHi | numLo)==0) { // inf numHi = resSign | pINF_MSW; // Inf with proper sign if ((denLo == 0) && ((denHi & SIGN_OFF_MSK)==pINF_MSW)) { // den inf numHi = pNAN_MSW; // Inf/Inf => NaN } } set__float128(result, numHi, numLo); return; } // numExp == 0 - zero or sub-normal if (((numHi & SIGN_OFF_MSK) | numLo)==0) { // zero if ((denHi & EXP_MSK)==pINF_MSW) { // den = inf or Nan if ((denLo == 0) && ((denHi & SIGN_OFF_MSK)==pINF_MSW)) { // den= inf numHi = resSign; // return zero with proper sign } else { // den= Nan // return den numLo = denLo; numHi = denHi; } } if (((denHi & SIGN_OFF_MSK) | denLo)==0) { // den is zero numHi = pNAN_MSW; // 0/0 => NaN } set__float128(result, numHi, numLo); return; } // num is sub-normal: normalize inumExp -= normalize_float128(&numMantHi, &numLo); } // // num is normal or normalized // // parse den and handle special cases unsigned denSexp = denHi >> 48; unsigned denExp = denSexp & 0x7FFF; int idenExp = denExp; uint64_t denMantHi = denHi & MANT_H_MSK; if (denExp - 1 >= 0x7FFF - 1) { // special cases if (denExp == 0x7FFF) { // inf or Nan if ((denMantHi | denLo)==0) { // den is inf // return zero with proper sign denHi = resSign; denLo = 0; } // if den==Nan then return den set__float128(result, denHi, denLo); return; } // denExp == 0 - zero or sub-normal if (((denHi & SIGN_OFF_MSK) | denLo)==0) { // den is zero // return Inf with proper sign numHi = resSign | pINF_MSW; numLo = 0; set__float128(result, numHi, numLo); return; } // den is sub-normal: normalize idenExp -= normalize_float128(&denMantHi, &denLo); } // // both num and den are normal or normalized // // calculate exponent of result int expDecr = (denMantHi > numMantHi) | ((denMantHi == numMantHi) & (denLo > numLo)); // mant(den) >= mant(num) int iResExp = inumExp - idenExp - expDecr + 0x3FFF; if (iResExp >= 0x7FFF) { // Overflow // return Inf with proper sign set__float128(result, resSign | pINF_MSW, 0); return; } if (iResExp < -112) { // underflow // return Zero with proper sign set__float128(result, resSign, 0); return; } // calculate 64-bit approximation from below for 1/mantissa of den uint64_t r = calc_reciprocal(denMantHi, denLo); // r is low estimate // r is in range [0x7fffffffffffffff..0xfffffffffffffffd] // r*x is in range [0.999_999_999_999_999_999_783..0.999_999_999_999_999_999_999_957] // Est. 
Precisions: ~62.00069 bits // max(r*x-1) ~= 3.998079 * 2**-64 uint64_t numMsw = ((numMantHi << 15) | (numLo >> (64-15))) + BIT_63; // scale = 2**63 uint64_t res1 = umulh64(numMsw, r) << expDecr; // scale = 2**63 __uint128 denx = ((__uint128)(denMantHi | BIT_48) << 64) | denLo; // scale = 2**112 uint64_t prod1 = (uint64_t)((res1*(denx << 11)) >> 64)+1; // scale = 2**122 uint64_t num122Lo = numLo << (10+expDecr); // scale = 2**122 uint64_t deltaProd= num122Lo - prod1; // scale = 2**122 uint64_t deltaRes = umulh64(deltaProd, r); // scale = 2**122 // Round deltaRes to scale = 2**112 const unsigned N_GUARD_BITS = 10; const unsigned MAX_ERR = 3; // scale 2**122 __uint128 res2x; if (iResExp > 0) { // normal result const unsigned GUARD_BIT = 1 << (N_GUARD_BITS-1); // scale 2**122 const unsigned GUARD_MSK = GUARD_BIT*2-1; // scale 2**122 uint64_t round_incr = GUARD_BIT; unsigned guard_bits = deltaRes & GUARD_MSK; if (guard_bits - (GUARD_BIT-MAX_ERR) < MAX_ERR) { // Guard bits are very close to the half-ULP (from below) // Direction of the rounding should be found by additional calculations __uint128 midRes = (((__uint128)res1 << 50)+(deltaRes >> 9)) | 1; // scale 2**113 // midRes resides between two representable output points __uint128 midProduct = midRes*denx; // scale 2**225 uint64_t midProductH = (uint64_t)(midProduct >> 64); // scale 2**161 if ((midProductH >> (161-113+expDecr)) & 1) round_incr = GUARD_MSK; // when guard bit of the product = 1, it means that the product of midRes < num, so we need to round up } deltaRes = (deltaRes + round_incr) >> 10; // scale = 2**112 res2x = ((__uint128)res1 << 49) + deltaRes; // scale = 2**112 } else { // sub-normal result int shift = 1 - iResExp; // range [1:113] const __uint128 GUARD_BIT = (__uint128)1 << (shift+N_GUARD_BITS-1); // scale = 2**122 const __uint128 GUARD_MSK = GUARD_BIT*2 - 1; // scale = 2**122 __uint128 resx = ((__uint128)res1 << 59) + deltaRes; // scale = 2**122 __uint128 guard_bits = resx & GUARD_MSK; resx >>= (shift+N_GUARD_BITS-2); // scale = 2**(114-shift) unsigned round_incr = 2; if (guard_bits - (GUARD_BIT-MAX_ERR) < MAX_ERR) { // Guard bits are very close to the half-ULP (from below) // Direction of the rounding should be found by additional calculations __uint128 midRes = resx ^ 3; // scale 2**(114-shift, 2 LS bits changed from '01' to '10' // midRes resides between two representable output points __uint128 midProduct = midRes*denx; // scale 2**(226-shift) midProduct -= ((unsigned)resx >> 2) & 1; // Decrement a product when LS Bit of non-rounded result=='1' // That's a dirty trick that breaks ties toward nearest even. // Only necessary in subnormal case. For normal results tie is impossible int guardPos = 226-113+expDecr-shift; uint64_t midProductGuardBit = (uint64_t)(midProduct >> guardPos) & 1; // When a guard bit of the product = 1, it means that the product of midRes*den < num // Which means that we have to round up round_incr += midProductGuardBit; } res2x = (resx + round_incr) >> 2; iResExp = 0; } numMantHi = (res2x >> 64) & MANT_H_MSK; numLo = res2x; numHi = ((uint64_t)(iResExp) << 48) | numMantHi | resSign; set__float128(result, numHi, numLo); }
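For completeness, here is the sort of small driver one might use to compare either my__divtf3 variant above with the division the compiler emits itself.  Only the function name and its pointer-based signature are taken from the posted code; the rest is a plain usage sketch:

#include <stdio.h>
#include <quadmath.h>

void my__divtf3 (__float128 *result, const __float128 *num, const __float128 *den);

int main (void)
{
  __float128 num = 2.0q, den = 3.0q, q1, q2;
  char b1[64], b2[64];
  my__divtf3 (&q1, &num, &den);
  q2 = num / den;                       /* libgcc __divtf3 (or hw divide) */
  quadmath_snprintf (b1, sizeof b1, "%.28Qa", q1);
  quadmath_snprintf (b2, sizeof b2, "%.28Qa", q2);
  printf ("my__divtf3: %s\n__divtf3:   %s\n", b1, b2);
  return 0;
}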
@Michael: Now that gcc 12 is out of the door, I would suggest we try to get your code into the gcc tree for gcc 13.

It should follow the gcc style guidelines, which is quite trivial to do: comments should be in /* - */ format, and clang-format --style=gnu does the rest.  (If you prefer, I can also handle that).

Regarding copyright, you can either assign it to the FSF (which would take a bit) or, probably easier, you can use the Signed-off-by: tag as per https://gcc.gnu.org/contribute.html#legal .

You would then submit the patch to gcc-patches@gcc.gnu.org; if you prefer, I can also help a bit with the style issues (ChangeLog etc) for that.
It would then be reviewed and, if any problems identified are fixed, applied - which I can also do.

How does that sound?
From what I can see, it is mostly an integer implementation, and we already have one such in GCC, so the question is whether we shouldn't just use that one (most of the sources are in libgcc/soft-fp/, the configuration files in libgcc/config/*/sfp-*, and the simple *.c file that uses the soft-fp macros is in glibc, in sysdeps/ieee754/soft-fp/s_fsqrtl.c).
(In reply to Thomas Koenig from comment #14)
> @Michael: Now that gcc 12 is out of the door, I would suggest we try to get
> your code into the gcc tree for gcc 13.
>
> It should follow the gcc style guidelines, which is quite trivial to do:
> comments should be in /* - */ format, and clang-format --style=gnu does the
> rest.  (If you prefer, I can also handle that).
>
> Regarding copyright, you can either assign it to the FSF (which would take
> a bit) or, probably easier, you can use the Signed-off-by: tag as per
> https://gcc.gnu.org/contribute.html#legal .
>
> You would then submit the patch to gcc-patches@gcc.gnu.org; if you prefer,
> I can also help a bit with the style issues (ChangeLog etc) for that.
> It would then be reviewed and, if any problems identified are fixed,
> applied - which I can also do.
>
> How does that sound?

Sounds not bad, except that I would want to keep a copy of my own part of the work in my public GitHub repo, to share it with the world under an MIT or BSD or plain giveaway license.  Is that a problem?

Also, if we start, then it makes sense to replace the more common routines (add, mul and esp. fma) as well.
Also, I suspect that the transcendentals, esp. trigs, can be improved by quite a lot if they are currently done in a style similar to sqrt() and fma().
Also, the code would be cleaner and marginally better performance-wise if we based it on a clang-style add_carry64().  According to my understanding, gcc is going to implement it in the future anyway, so why not now?

As to these specific routines, the variants posted here still contain one bug in the handling of sub-normal inputs (a corner case of the lowest non-zero).
Also, for square root I have a variant that is a little simpler and performs somewhat better.  And it is commented better.
(In reply to Jakub Jelinek from comment #15)
> From what I can see, it is mostly an integer implementation, and we already
> have one such in GCC, so the question is whether we shouldn't just use that
> one (most of the sources are in libgcc/soft-fp/, the configuration files in
> libgcc/config/*/sfp-*, and the simple *.c file that uses the soft-fp macros
> is in glibc, in sysdeps/ieee754/soft-fp/s_fsqrtl.c).

If you have a good implementation, what was the reasoning behind shipping a bad one?
libquadmath is essentially legacy code.  People working directly in C should be using the C23 _Float128 interfaces and *f128 functions, as in current glibc, rather than libquadmath interfaces (unless their code needs to support old glibc or non-glibc C libraries that don't support _Float128 in C23 Annex H).  It would be desirable to make GCC generate *f128 calls when appropriate from Fortran code using this format as well; see <https://gcc.gnu.org/pipermail/gcc-patches/2021-September/578937.html> for more discussion of the different cases involved.

Most of libquadmath is derived from code in glibc - some of it can now be updated from the glibc code automatically (see update-quadmath.py), other parts can't (although it would certainly be desirable to extend update-quadmath.py to cover that other code as well).  See the commit message for commit 4239f144ce50c94f2c6cc232028f167b6ebfd506 for a more detailed discussion of what code comes from glibc and what is / is not automatically handled by update-quadmath.py.  Since update-quadmath.py hasn't been run for a while, it might need changes to work with more recent changes to the glibc code.

sqrtq.c is one of the files not based on glibc code.  That's probably because glibc didn't have a convenient generic implementation of binary128 sqrt to use when libquadmath was added - it has soft-fp implementations used for various architectures, but those require sfp-machine.h for each architecture (which maybe we do in fact have in libgcc for each relevant architecture, but it's an extra complication).  Certainly making it possible to use code from glibc for binary128 sqrt would be a good idea, but while we aren't doing that, it should also be OK to improve sqrtq locally in libquadmath.

The glibc functions for this format are generally *not* optimized for speed yet (this includes the soft-fp-based versions of sqrt).  Note that what's best for speed may depend a lot on whether the architecture has hardware support for binary128 arithmetic; if it has such support, it's more likely an implementation based on binary128 floating-point operations is efficient; if it doesn't, direct use of integer arithmetic, without lots of intermediate packing / unpacking into the binary128 format, is likely to be more efficient.  See the discussion starting at <https://sourceware.org/pipermail/libc-alpha/2020-June/thread.html#115229> for more on this - glibc is a better place for working on most optimized function implementations than GCC.  See also <https://core-math.gitlabpages.inria.fr/> - those functions are aiming to be correctly rounding, which is *not* a goal for most glibc libm functions, but are still quite likely to be faster than the existing non-optimized functions in glibc.

fma is a particularly tricky case because it *is* required to be correctly rounding, in all rounding modes, and correct rounding implies correct exceptions, *and* correct exceptions for fma includes getting right the architecture-specific choice of whether tininess is detected before or after rounding.

Correct exceptions for sqrt are simpler, but to be correct for glibc it still needs to avoid spurious "inexact" exceptions - for example, from the use of double in intermediate computations in your version (see the optimized feholdexcept / fesetenv operations used in glibc for cases where exceptions from intermediate computations are to be discarded).

For functions that aren't required to be correctly rounding, the glibc manual discusses the accuracy goals (including on exceptions, e.g. avoiding spurious "underflow" exceptions from intermediate computations for results where the rounded result returned is not consistent with rounding a tiny, inexact value).
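The feholdexcept / fesetenv pattern mentioned above looks, in its plain C99 form, roughly like the sketch below.  glibc internally uses faster specialized equivalents rather than these public functions, and the usual FENV_ACCESS caveats apply, so this is only a generic illustration of how exceptions from an intermediate step can be discarded:

#include <fenv.h>

static double
helper_without_spurious_flags (double x)
{
  fenv_t env;
  feholdexcept (&env);      /* save environment, clear flags, go non-stop */
  double approx = 1.0 / x;  /* intermediate step that may raise inexact   */
  fesetenv (&env);          /* restore; flags raised above are discarded  */
  return approx;
}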
(In reply to joseph@codesourcery.com from comment #18) > libquadmath is essentially legacy code. People working directly in C > should be using the C23 _Float128 interfaces and *f128 functions, as in > current glibc, rather than libquadmath interfaces (unless their code needs > to support old glibc or non-glibc C libraries that don't support _Float128 > in C23 Annex H). It would be desirable to make GCC generate *f128 calls > when appropriate from Fortran code using this format as well; see > <https://gcc.gnu.org/pipermail/gcc-patches/2021-September/578937.html> for > more discussion of the different cases involved. > On MSYS2 _Float128 and __float128 appears to be mostly the same thing, mapped to the same library routines with significant difference that _Float128 is not accessible from C++. Since all my test benches are written in C++ I can't even validate that what I wrote above is 100% true. Also according to my understanding of glibc docs (not the clearest piece of text that I ever read) a relevant square root routine should be named sqrtf128(). Unfortunately, nothing like that appears to be present in either math.h or in library. Am I doing something wrong? > Most of libquadmath is derived from code in glibc - some of it can now be > updated from the glibc code automatically (see update-quadmath.py), other > parts can't (although it would certainly be desirable to extend > update-quadmath.py to cover that other code as well). See the commit > message for commit 4239f144ce50c94f2c6cc232028f167b6ebfd506 for a more > detailed discussion of what code comes from glibc and what is / is not > automatically handled by update-quadmath.py. Since update-quadmath.py > hasn't been run for a while, it might need changes to work with more > recent changes to the glibc code. > > sqrtq.c is one of the files not based on glibc code. That's probably > because glibc didn't have a convenient generic implementation of binary128 > sqrt to use when libquadmath was added - it has soft-fp implementations > used for various architectures, but those require sfp-machine.h for each > architecture (which maybe we do in fact have in libgcc for each relevant > architecture, but it's an extra complication). Certainly making it > possible to use code from glibc for binary128 sqrt would be a good idea, > but while we aren't doing that, it should also be OK to improve sqrtq > locally in libquadmath. > > The glibc functions for this format are generally *not* optimized for > speed yet (this includes the soft-fp-based versions of sqrt). Note that > what's best for speed may depend a lot on whether the architecture has > hardware support for binary128 arithmetic; if it has such support, it's > more likely an implementation based on binary128 floating-point operations > is efficient; Not that simple. Right now, there are only two [gcc] platforms with hw binary128 - IBM POWER and IBM z. I am not sure about the later, but the former has xssqrtqp instruction which is likely the right way to do sqrtq()/sqrtf128() on this platform. If z is the same, which sound likely, then implementation based on binary128 mul/add/fma by now has no use cases at all. > if it doesn't, direct use of integer arithmetic, without > lots of intermediate packing / unpacking into the binary128 format, is > likely to be more efficient. It's not just redundant packing/unpacking. 
A direct integer implementation does fewer arithmetic operations as well, mainly because it knows exactly which parts of the 226-bit multiplication product are relevant and does not calculate the parts that are irrelevant.
And with integer math it is much easier to achieve correct rounding at corner cases that call for precision in excess of 226 bits, so even fmaq() is not enough.  And yes, there are one or two cases like that.

> See the discussion starting at
> <https://sourceware.org/pipermail/libc-alpha/2020-June/thread.html#115229>
> for more on this - glibc is a better place for working on most optimized
> function implementations than GCC.  See also
> <https://core-math.gitlabpages.inria.fr/> - those functions are aiming to
> be correctly rounding, which is *not* a goal for most glibc libm
> functions, but are still quite likely to be faster than the existing
> non-optimized functions in glibc.
>
> fma is a particularly tricky case because it *is* required to be correctly
> rounding, in all rounding modes, and correct rounding implies correct
> exceptions, *and* correct exceptions for fma includes getting right the
> architecture-specific choice of whether tininess is detected before or
> after rounding.

I suspect that by strict IEEE-754 rules sqrt() is the same as fma(), i.e. you have to calculate the root to infinite precision and then round in accordance with the current mode.

[O.T.] The whole rounding-modes business complicates things quite a lot for very small benefit to the end users, who almost never want anything but "round to nearest with tie broken to even".  If I understand correctly, it's the only mode supported by the quadmath library, and that makes good practical sense. [O.T.]

> Correct exceptions for sqrt are simpler, but to be correct for glibc it
> still needs to avoid spurious "inexact" exceptions - for example, from the
> use of double in intermediate computations in your version (see the
> optimized feholdexcept / fesetenv operations used in glibc for cases where
> exceptions from intermediate computations are to be discarded).

I don't quite understand why you say that.  First, I don't remember using any double math in the variant of sqrtq() posted here.  So, maybe you meant division?
Here I do use double math, but IMO I use it in a way that never causes exceptions.  Not even inexact.
Not that I think that the inexact exception is a problem - during my whole not-so-short career as a programmer I never encountered a real-world program (as opposed to a demonstration of the feature) that enables this exception.

> For functions that aren't required to be correctly rounding, the glibc
> manual discusses the accuracy goals

That's interesting and certainly worth discussing.

> (including on exceptions, e.g.
> avoiding spurious "underflow" exceptions from intermediate computations
> for results where the rounded result returned is not consistent with
> rounding a tiny, inexact value).

That's less interesting and less worth discussing.  "Underflow/Inexact" is idiocy.  Better ignored.
On Sat, 11 Jun 2022, already5chosen at yahoo dot com via Gcc-bugs wrote:

> On MSYS2 _Float128 and __float128 appears to be mostly the same thing, mapped
> to the same library routines with significant difference that _Float128 is not
> accessible from C++. Since all my test benches are written in C++ I can't even
> validate that what I wrote above is 100% true.
>
> Also according to my understanding of glibc docs (not the clearest piece of
> text that I ever read) a relevant square root routine should be named
> sqrtf128().
> Unfortunately, nothing like that appears to be present in either math.h or in
> library. Am I doing something wrong?

The function should be sqrtf128 (present in glibc 2.26 and later on x86_64, x86, powerpc64le, ia64). I don't know about support in non-glibc C libraries.

> Right now, there are only two [gcc] platforms with hw binary128 - IBM POWER and
> IBM z. I am not sure about the later, but the former has xssqrtqp instruction
> which is likely the right way to do sqrtq()/sqrtf128() on this platform. If z
> is the same, which sound likely, then implementation based on binary128
> mul/add/fma by now has no use cases at all.

That may well be the case for sqrt.

> > fma is a particularly tricky case because it *is* required to be correctly
> > rounding, in all rounding modes, and correct rounding implies correct
> > exceptions, *and* correct exceptions for fma includes getting right the
> > architecture-specific choice of whether tininess is detected before or
> > after rounding.
>
> I suspect that by strict IEEE-754 rules sqrt() is the same as fma(), i.e. you
> have to calculate a root to infinite precision and then to round with
> accordance to current mode.

Yes, but fma has the extra complication of the architecture-specific tininess detection rules (to quote IEEE 754, "The implementer shall choose how tininess is detected [i.e. from the two options listed immediately above this quote in IEEE 754], but shall detect tininess in the same way for all operations in radix two"), which doesn't apply to sqrt because sqrt results can never underflow.

(I expect the existing soft-fp version of fma in glibc to be rather better optimized than the soft-fp version of sqrt.)

> I don't quite or understand why you say that. First, I don't remember using any
> double math in the variant of sqrtq() posted here. So, may be, you meant
> division?
> Here I use doable math, but IMO, I use it in a way that never causes
> exceptions.

Yes, the double division.
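For readers unfamiliar with it, the round-to-odd trick referred to here looks roughly like the sketch below. This is only an illustration: it assumes a wide operation that honours the dynamic rounding mode and raises FE_INEXACT reliably (which, as this PR discusses, the current sqrtq does not promise), and it assumes a little-endian __float128 layout; the function name is made up.

#include <fenv.h>
#include <string.h>
#include <quadmath.h>

/* Hedged sketch: narrow a binary128 square root to double with a
   single rounding, using round-to-odd in the wide format. */
static double sqrt_narrow_via_round_to_odd (__float128 x)
{
  fenv_t env;
  feholdexcept (&env);                   /* save env, clear flags, go non-stop */
  fesetround (FE_TOWARDZERO);
  volatile __float128 wide = sqrtq (x);  /* truncated binary128 result */
  int was_inexact = fetestexcept (FE_INEXACT) != 0;
  fesetenv (&env);                       /* restore caller's mode and flags */

  __float128 t = wide;
  if (was_inexact)
    {
      /* "Round to odd": force the lowest mantissa bit to 1 so the value
         can never sit exactly on a halfway point for the narrowing step.
         Byte 0 holds the low mantissa bits on little-endian targets. */
      unsigned char rep[sizeof t];
      memcpy (rep, &t, sizeof rep);
      rep[0] |= 1;
      memcpy (&t, rep, sizeof rep);
    }
  return (double) t;                     /* one rounding, in the caller's mode */
}

Because binary128 carries far more than two extra significand bits relative to double, the final conversion then rounds correctly in any mode - but only if the wide operation reports "inexact" truthfully, which is exactly the point being made above.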
(In reply to joseph@codesourcery.com from comment #20)
> On Sat, 11 Jun 2022, already5chosen at yahoo dot com via Gcc-bugs wrote:
>
> > On MSYS2 _Float128 and __float128 appears to be mostly the same thing, mapped
> > to the same library routines with significant difference that _Float128 is not
> > accessible from C++. Since all my test benches are written in C++ I can't even
> > validate that what I wrote above is 100% true.
> >
> > Also according to my understanding of glibc docs (not the clearest piece of
> > text that I ever read) a relevant square root routine should be named
> > sqrtf128().
> > Unfortunately, nothing like that appears to be present in either math.h or in
> > library. Am I doing something wrong?
>
> The function should be sqrtf128 (present in glibc 2.26 and later on
> x86_64, x86, powerpc64le, ia64). I don't know about support in non-glibc
> C libraries.

x86-64 gcc on Godbolt does not appear to know about it. I think Godbolt uses a rather standard Linux with quite a new glibc and headers.
https://godbolt.org/z/Y4YecvxK6

> > Right now, there are only two [gcc] platforms with hw binary128 - IBM POWER and
> > IBM z. I am not sure about the later, but the former has xssqrtqp instruction
> > which is likely the right way to do sqrtq()/sqrtf128() on this platform. If z
> > is the same, which sound likely, then implementation based on binary128
> > mul/add/fma by now has no use cases at all.
>
> That may well be the case for sqrt.
>
> > > fma is a particularly tricky case because it *is* required to be correctly
> > > rounding, in all rounding modes, and correct rounding implies correct
> > > exceptions, *and* correct exceptions for fma includes getting right the
> > > architecture-specific choice of whether tininess is detected before or
> > > after rounding.
> >
> > I suspect that by strict IEEE-754 rules sqrt() is the same as fma(), i.e. you
> > have to calculate a root to infinite precision and then to round with
> > accordance to current mode.
>
> Yes, but fma has the extra complication of the architecture-specific
> tininess detection rules (to quote IEEE 754, "The implementer shall choose
> how tininess is detected [i.e. from the two options listed immediately
> above this quote in IEEE 754], but shall detect tininess in the same way
> for all operations in radix two"), which doesn't apply to sqrt because
> sqrt results can never underflow.
>
> (I expect the existing soft-fp version of fma in glibc to be rather better
> optimized than the soft-fp version of sqrt.)
>

Maybe. I don't know how to get the soft-fp version. What I do know is that the version of fmaq() shipped with gcc/gfortran on MSYS2 x86-64 is absurdly slow - on the order of 4 microseconds on quite fast modern CPUs. I don't know what you'd call this variant - hard, soft, or maybe freezing. sqrtq() is also slow, but 3 to 5 times faster than that.

> > I don't quite or understand why you say that. First, I don't remember using any
> > double math in the variant of sqrtq() posted here. So, may be, you meant
> > division?
> > Here I use doable math, but IMO, I use it in a way that never causes
> > exceptions.
>
> Yes, the double division.
> If that division can ever be inexact when the
> result of the square root itself is exact, you have a problem (which in
> turn could lead to actual incorrectly rounded results from other functions
> such as the square root operations rounding to a narrower type, when the
> round-to-odd versions of those functions are used, because the
> round-to-odd technique relies on correct "inexact" exceptions).

It seems you didn't pay attention that in my later posts I am giving implementations of binary128 *division* rather than sqrtq().

sqrtq() is in Comment 9. It's pure integer, no FP (double) math involved.

Comment 12 is a pure integer implementation of binary128 division. Again, no FP (double) math involved.

Comment 13 is again an implementation of binary128 division. It does use double math (division) internally. On modern Intel and AMD CPUs it is a little faster than the one from Comment 12, but not radically so. Both the arguments and the result of the double division are well controlled; they never come close to the subnormal range.

All three implementations support only one rounding mode, the default (round to nearest with tie broken to even). They do not generate any exceptions, neither underflow nor even overflow. In case of overflow the division silently returns Inf with the proper sign. Support for exceptions and for non-default rounding could be added relatively easily, but would undoubtedly cause a slowdown. I'd expect that the main performance bottleneck would not even be the additional tests and branches (modern branch predictors are very good at sorting out uncommon cases), but rather reading the control "register" in a thread-safe manner.

BTW, I see no mention of rounding control or of any sort of exceptions in the GCC libquadmath docs. No APIs with names resembling fesetround() or mpfr_set_default_rounding_mode().
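As an aside, latency claims of the kind quoted above ("order of 4 microseconds per fmaq") are easy to sanity-check on any given machine with a rough microbenchmark along these lines (a sketch only; link with -lquadmath, and expect the numbers to vary with CPU and with how the dependency chain is arranged):

#include <quadmath.h>
#include <stdio.h>
#include <time.h>

int main (void)
{
  enum { N = 1000000 };
  volatile __float128 acc = 1.25Q;   /* volatile keeps the loop honest */
  struct timespec t0, t1;

  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (int i = 0; i < N; i++)
    acc = sqrtq (acc + 1.5Q);        /* serial chain: measures latency */
  clock_gettime (CLOCK_MONOTONIC, &t1);

  double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
  printf ("sqrtq: about %.0f ns per call (final value %g)\n",
          ns / N, (double) acc);
  return 0;
}

Substituting fmaq (for example acc = fmaq (acc, 0.999999Q, 0.000001Q)) gives the corresponding fma figure.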
On Mon, 13 Jun 2022, already5chosen at yahoo dot com via Gcc-bugs wrote:

> > The function should be sqrtf128 (present in glibc 2.26 and later on
> > x86_64, x86, powerpc64le, ia64). I don't know about support in non-glibc
> > C libraries.
>
> x86-64 gcc on Godbolt does not appear to know about it.
> I think, Godbolt uses rather standard Linux with quite new glibc and headers.
> https://godbolt.org/z/Y4YecvxK6

Make sure to define _GNU_SOURCE or __STDC_WANT_IEC_60559_TYPES_EXT__ to get these declarations.

> May be. I don't know how to get soft-fp version.

For long double it's sysdeps/ieee754/soft-fp/s_fmal.c in glibc - some adjustments would be needed to be able to use that as a version for _Float128 (where sysdeps/ieee754/float128/s_fmaf128.c currently always uses the ldbl-128 version), in appropriate cases.

> It seems, you didn't pay attention that in my later posts I am giving
> implementations of binary128 *division* rather than sqrtq().

Ah - binary128 division is nothing to do with libquadmath at all (the basic arithmetic operations go in libgcc, not libquadmath). Using a PR about one issue as an umbrella discussion of various vaguely related things is generally confusing and unhelpful to tracking the status of what is or is not fixed.

In general, working out how to optimize the format-generic code in soft-fp if possible would be preferred to writing format-specific implementations. Note that for multiplication and division there are already various choices of implementation approaches that can be selected via macros defined in sfp-machine.h.

> BTW, I see no mentioning of rounding control or of any sort of exceptions in
> GCC libquadmath docs. No APIs with names resembling fesetround() or
> mpfr_set_default_rounding_mode().

The underlying arithmetic (in libgcc, not libquadmath) uses the hardware rounding mode and exceptions (if the x87 and SSE rounding modes disagree, things are liable to go wrong), via various macros defined in sfp-machine.h.
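Concretely, something like the following should build and run against glibc 2.26 or later (it will not help on MSYS2, whose C library is not glibc). The macro has to come before the first glibc header; sqrtf128 lives in libm, so link with -lm. The input value here is an arbitrary placeholder - substitute one of the failing inputs from this PR to compare against sqrtq.

#define __STDC_WANT_IEC_60559_TYPES_EXT__ 1
#include <math.h>
#include <stdlib.h>
#include <stdio.h>

int main (void)
{
  _Float128 x = 2;                   /* placeholder input */
  _Float128 r = sqrtf128 (x);

  char buf[64];
  strfromf128 (buf, sizeof buf, "%.28a", r);   /* glibc helper for printing _Float128 */
  puts (buf);
  return 0;
}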
(In reply to joseph@codesourcery.com from comment #22)
> On Mon, 13 Jun 2022, already5chosen at yahoo dot com via Gcc-bugs wrote:
>
> > > The function should be sqrtf128 (present in glibc 2.26 and later on
> > > x86_64, x86, powerpc64le, ia64). I don't know about support in non-glibc
> > > C libraries.
> >
> > x86-64 gcc on Godbolt does not appear to know about it.
> > I think, Godbolt uses rather standard Linux with quite new glibc and headers.
> > https://godbolt.org/z/Y4YecvxK6
>
> Make sure to define _GNU_SOURCE or __STDC_WANT_IEC_60559_TYPES_EXT__ to
> get these declarations.
>

Thank you. It made a difference on Godbolt. It does not help on MSYS2, but at least I will be able to test on Linux.

> > May be. I don't know how to get soft-fp version.
>
> For long double it's sysdeps/ieee754/soft-fp/s_fmal.c in glibc - some
> adjustments would be needed to be able to use that as a version for
> _Float128 (where sysdeps/ieee754/float128/s_fmaf128.c currently always
> uses the ldbl-128 version), in appropriate cases.
>

Way too complicated for a mere gcc user like myself. Hopefully, Thomas Koenig will understand it better.

> > It seems, you didn't pay attention that in my later posts I am giving
> > implementations of binary128 *division* rather than sqrtq().
>
> Ah - binary128 division is nothing to do with libquadmath at all (the
> basic arithmetic operations go in libgcc, not libquadmath). Using a PR
> about one issue as an umbrella discussion of various vaguely related
> things is generally confusing and unhelpful to tracking the status of what
> is or is not fixed.

You are right. Again, hopefully Thomas Koenig will open a thread in which a discussion of all issues related to libquadmath and binary128 will be on topic. But, formally speaking, the slowness of division or of fma is not a bug, so maybe there are better places than bugzilla to discuss them; I just don't know what place that would be. The good thing about bugzilla is a convenient forum-like user interface.

>
> In general, working out how to optimize the format-generic code in soft-fp
> if possible would be preferred to writing format-specific implementations.
> Note that for multiplication and division there are already various
> choices of implementation approaches that can be selected via macros
> defined in sfp-machine.h.
>

I am not sure that I like the idea of a format-generic solution for the specific case of soft binary128. binary128 is very special, because it is the smallest. It can gain a lot of speed from format-specific handling. Not sure if 3x, but pretty certain about 2x.

> > BTW, I see no mentioning of rounding control or of any sort of exceptions in
> > GCC libquadmath docs. No APIs with names resembling fesetround() or
> > mpfr_set_default_rounding_mode().
>
> The underlying arithmetic (in libgcc, not libquadmath) uses the hardware
> rounding mode and exceptions (if the x87 and SSE rounding modes disagree,
> things are liable to go wrong), via various macros defined in
> sfp-machine.h.

Oh, a mess! An implementation that is either 99% or 100% integer being controlled by the SSE control word is WRONG. The x87 control word, of course, is no better than SSE. But BOTH!!!! I have no words. Hopefully, libquadmath is saner than that, but I am unable to figure it out from the docs.
On Mon, 13 Jun 2022, already5chosen at yahoo dot com via Gcc-bugs wrote:

> > For long double it's sysdeps/ieee754/soft-fp/s_fmal.c in glibc - some
> > adjustments would be needed to be able to use that as a version for
> > _Float128 (where sysdeps/ieee754/float128/s_fmaf128.c currently always
> > uses the ldbl-128 version), in appropriate cases.
>
> Way to complicated for mere gcc user like myself.
> Hopefully, Thomas Koenig will understand better.

glibc needs to handle a lot of different configurations with various choices of supported floating-point types - resulting in complexity around how the particular function implementations are chosen for a given system - as well as other portability considerations. There is also complexity resulting from the functions covering many different use cases - and thus needing to follow all the IEEE 754 requirements for those functions although many users may only care about some of those requirements.

> > The underlying arithmetic (in libgcc, not libquadmath) uses the hardware
> > rounding mode and exceptions (if the x87 and SSE rounding modes disagree,
> > things are liable to go wrong), via various macros defined in
> > sfp-machine.h.
>
> Oh, a mess!
> With implementation that is either 99% or 100% integer being controlled by SSE
> control is WRONG. x87 control word, of course, is no better than SSE.
> But BOTH !!!! I have no words.

Any given libgcc build will only use one of the rounding modes (SSE for 64-bit, x87 for 32-bit) - but which exception state gets updated in the 32-bit case depends on whether libgcc was built for SSE arithmetic.

As far as IEEE 754 is concerned, there is only one rounding mode for all operations with a binary result (and a separate rounding mode for decimal FP results). As far as the C ABI is concerned, it's not valid for the two rounding modes to be different at any ABI boundary; fesetround will always set both, while fetestexcept etc. handle both sets of exception flags by ORing them together.

*But* glibc's internal optimizations for code that saves and restores floating-point state internally try to manipulate only SSE state, or only x87 state, in functions that should only use one of those two sets of state, so code called from within those functions may need to work properly when the two rounding modes are different. Together with the above point about which exception state gets used in libgcc support for _Float128, this results in those optimizations, in the float128 case, sometimes only needing to handle SSE state but sometimes needing to handle both sets of state (see glibc's sysdeps/x86/fpu/fenv_private.h).
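The behaviour described here is easy to observe directly. A small sketch for x86_64/glibc (an assumption, not a general claim about other targets): the volatile operands keep the compiler from folding the divisions at compile time, and -lquadmath is needed for quadmath_snprintf.

#include <fenv.h>
#include <stdio.h>
#include <quadmath.h>

int main (void)
{
  volatile __float128 a = 1, b = 3;
  char buf[64];

  fesetround (FE_DOWNWARD);
  __float128 lo = a / b;         /* libgcc's binary128 division, rounded downward */
  fesetround (FE_UPWARD);
  __float128 hi = a / b;         /* same operands, now rounded upward */
  fesetround (FE_TONEAREST);

  quadmath_snprintf (buf, sizeof buf, "%.28Qa", lo);
  printf ("FE_DOWNWARD: %s\n", buf);
  quadmath_snprintf (buf, sizeof buf, "%.28Qa", hi);
  printf ("FE_UPWARD:   %s\n", buf);
  return 0;
}

Since 1/3 is inexact in binary128, the two printed values should differ in the last hex digit, showing that the "soft" binary128 division really is steered by the dynamic (SSE) rounding mode, as described above.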