/* gcc-4.6.0 floating-point optimization regression on ia64-Linux % gcc -O1 -o foo foo.c % foo g3 = 35f2349dc34f3df6 i[1]= 3ca1a62633145c07 i[2]= b92f1976b7ed8fbc j[1]= 3ca4d57ee2b1013a j[2]= b92618713a31d3e2 val = 35bfd0f99254c580 8.53973422267356774285 402114580b45d475 % % % gcc -O2 -o foo foo.c % foo g3 = 35f2349dc34f3df6 i[1]= 3ca1a62633145c07 i[2]= b92f1976b7ed8fbc j[1]= 3ca4d57ee2b1013a j[2]= b92618713a31d3e2 val = 35bfd0f99254c590 8.53973422267356596649 402114580b45d474 **ERROR** QD_check failed at x[0] % % % gcc-4.5.1 -O2 -o foo foo.c % foo g3 = 35f2349dc34f3df6 i[1]= 3ca1a62633145c07 i[2]= b92f1976b7ed8fbc j[1]= 3ca4d57ee2b1013a j[2]= b92618713a31d3e2 val = 35bfd0f99254c590 8.53973422267356774285 402114580b45d475 % # note that this code compiles and runs successfully # with gcc-4.5.1 -03 % gcc -v Using built-in specs. COLLECT_GCC=gcc COLLECT_LTO_WRAPPER=/usr/local/gcc-4.6.0/ia64-Linux-gmp-5.0.1/libexec/gcc/ia64-unknown-linux-gnu/4.6.0/lto-wrapper Target: ia64-unknown-linux-gnu Configured with: /usr/local/gcc-4.6.0/src/gcc-4.6.0/configure --enable-languages=c,c++,fortran --with-gnu-as --with-as=/usr/local/binutils-2.21/ia64-Linux-rhel-gcc-4.5.1/bin/as --with-gnu-ld --with-ld=/usr/local/binutils-2.21/ia64-Linux-rhel-gcc-4.5.1/bin/ld --with-gmp=/usr/local/gmp-5.0.1/ia64-Linux-rhel-gcc-4.5.0 --with-mpfr=/usr/local/mpfr-3.0.1/ia64-Linux-rhel-gmp-5.0.1-gcc-4.6.0 --with-mpc=/usr/local/mpc-0.9/ia64-Linux-rhel-gmp-5.0.1-mpfr-3.0.1-gcc-4.6.0 --prefix=/usr/local/gcc-4.6.0/ia64-Linux-gmp-5.0.1 Thread model: posix gcc version 4.6.0 (GCC) % */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <fpu_control.h> double QD_e[4]={2.718281828459045091e+00,1.445646891729250158e-16, -2.127717108038176765e-33,1.515630159841218954e-49}; double QD_pi[4]= {3.141592653589793116e+00,1.224646799147353207e-16, -2.994769809718339666e-33,1.112454220863365282e-49}; #define QD_qsum(a,b,s,e) {double z1; s=a+b; z1=s-a; e=b-z1;} #define QD_sum(a,b,s,e)\ {double bb,z1,z2; s=a+b; bb=s-a; z1=s-bb; z2=a-z1; z1=b-bb; e=z1+z2;} #define QD_split(a,hi,lo)\ {double xt=134217729.0*a,z1; z1=xt-a; hi=xt-z1; lo=a-hi;} #define QD_prod(a,b,p,e)\ {double ah,al,bh,bl,z1,z2,z3,z4; p=a*b; QD_split(a,ah,al); QD_split(b,bh,bl);\ z1=ah*bh; z2=z1-p; z1=ah*bl; z3=al*bh; z4=z1+z3; z1=z2+z4; z3=al*bl; e=z1+z3;} double QD_mul(double *i,double *j,double *o) { double a0,a1,b1,b2,c1,c2,d2,d3,e2,e3,f2,f3; double g3,h1,h2,h3,k2,k3,l2,l3,m2,m3,n2,n3,p2,p3; double t1,t2,t3; double a2, a3,o0; double val; QD_prod(i[0],j[0],a0,a1); QD_prod(i[0],j[1],b1,b2); QD_prod(i[1],j[0],c1,c2); QD_prod(i[0],j[2],d2,d3); QD_prod(i[2],j[0],e2,e3); QD_prod(i[1],j[1],f2,f3); g3=i[0]*j[3]+i[3]*j[0]; QD_sum(a1,b1,t1,t2); QD_sum(c1,t1,h1,t3); QD_sum(t2,t3,h2,h3); QD_sum(b2,c2,k2,k3); QD_sum(d2,e2,l2,l3); QD_sum(f2,h2,m2,m3); printf("g3 = %lx\n", g3); printf("i[1]= %lx\n", i[1]); printf("i[2]= %lx\n", i[2]); printf("j[1]= %lx\n", j[1]); printf("j[2]= %lx\n", j[2]); g3+=(i[1]*j[2]+i[2]*j[1]); val = g3; /* printing g3 here makes the bug disappear, so use return */ QD_sum(k2,l2,n2,n3); QD_sum(m2,n2,p2,p3); g3+=d3+e3+f3+h3+k3+l3+m3+n3+p3; QD_qsum(p2,g3,a2,a3); QD_qsum(h1,a2,b1,b2); QD_qsum(a0,b1,o0,c1); o[0]=o0; QD_qsum(c1,b2,o[1],d2); QD_qsum(d2,a3,o[2],o[3]); return val; } void errorit(S) char *S; {printf("**ERROR** %s\n",S); exit(-1);} int main() { double x[4]; double val; /* This fpu_control code should set the control word of the floating point processor to round to an IEEE-754 double (53-bit mantissa, 64-bits in total), rather than the default extended precision However the bug also occurs if this fpu_control code is removed or commented out. */ fpu_control_t fpu_control=0x027f; _FPU_SETCW(fpu_control); val = QD_mul(QD_pi,QD_e,x); printf("val = %lx\n", val); printf("%1.20f\n", x[0]); printf("%lx\n", x[0]); if (x[0]!=8.53973422267356774285) errorit("QD_check failed at x[0]"); if (x[1]!=-6.7738152905024242803e-16) errorit("QD_check failed at x[1]"); if (x[2]!=1.6082340642907151632e-32) errorit("QD_check failed at x[2]"); return 0; }
Minimal code to demonstrate the problem in qd.i. The problematic flag seems to be -fexpensive-optimizations. $ gcc qd.i -O1 -o qd && ./qd 0x1.114580b45d474p+3 $ gcc qd.i -O1 -fexpensive-optimizations -o qd && ./qd 0x1.114580b45d475p+3
Created attachment 24161 [details] Testcase
The difference in results is due to the use of the fma instructions (fused multiply add). If you don't want the fma instructions to be used you should use the -ffp-contract=on flag. Otherwise GCC will use them and accept the slightly different floating point results due to no rounding being done between the multiply and the add in the fma instruction. GCC will also do other optimizations that may change the results of inexact floating point instructions. I am resolving this as invalid since this change in the least significant bit of an inexact floating point value is reasonable without fp-contract being on.
(In reply to comment #3) > The difference in results is due to the use of the fma instructions (fused > multiply add). If you don't want the fma instructions to be used you should > use the -ffp-contract=on flag. Thanks for the reply, I understand your reasoning. However, will this option guarantee strict IEEE floating-point behaviour or are there other flags to be specified? I have always been under the impression that *by default* gcc produced code respecting the floating-point standard unless one specifies something like -ffast-math. It seems this has changed in recent GCC versions.
I am not sure any set of flags will guarantee strict IEEE floating point behavior. See PR37845 for some more information.