Example: static float sRGB_case4(float x) { // import std.math : exp; return 1.055f * expImpl(x) - 0.055f; // expImpl not inlined by default // (inlined when using pragma(inline, true), but that fails to inline in DMD) } // pragma(inline, true) // This is borrowed from phobos/exponential.d to help gcc inline it fully. // Only T == float case is here (as some traits are private to phobos). // Also isNaN and range checks are removed, as sRGB performs own checks. static private T expImpl(T)(T x) @safe pure nothrow @nogc { //import std.math : floatTraits, RealFormat; //import std.math.traits : isNaN; //import std.math.rounding : floor; //import std.math.algebraic : poly; //import std.math.constants : LOG2E; import std.math; import core.math; static immutable T[6] P = [ 5.0000001201E-1, 1.6666665459E-1, 4.1665795894E-2, 8.3334519073E-3, 1.3981999507E-3, 1.9875691500E-4, ]; enum T C1 = 0.693359375; enum T C2 = -2.12194440e-4; // Overflow and Underflow limits. enum T OF = 88.72283905206835; enum T UF = -103.278929903431851103; // ln(2^-149) // Special cases. //if (isNaN(x)) // return x; //if (x > OF) // return real.infinity; //if (x < UF) // return 0.0; // Express: e^^x = e^^g * 2^^n // = e^^g * e^^(n * LOG2E) // = e^^(g + n * LOG2E) T xx = floor((cast(T) LOG2E) * x + cast(T) 0.5); // NOT INLINED! const int n = cast(int) xx; x -= xx * C1; x -= xx * C2; xx = x * x; x = poly(x, P) * xx + x + 1.0f; // poly is generated optimally, but not inlined // Scale by power of 2. x = core.math.ldexp(x, n); // NOT INLINED return x; } gdc gdc (Compiler-Explorer-Build-gcc-454a4d5041f53cd1f7d902f6c0017b7ce95b36df-binutils-2.38) 13.0.1 20230318 (experimental) gdc -O3 -march=znver2 -frelease -fbounds-check=off pure nothrow @nogc @safe float std.math.algebraic.poly!(float, float, 6).poly(float, ref const(float[6])): vmovss xmm1, DWORD PTR [rdi+20] vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+16] vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+12] vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+8] vfmadd213ss xmm1, xmm0, DWORD PTR [rdi+4] vfmadd213ss xmm0, xmm1, DWORD PTR [rdi] ret pure nothrow @nogc @safe float example.expImpl!(float).expImpl(float): push rbx vmovaps xmm1, xmm0 sub rsp, 16 vmovss xmm0, DWORD PTR .LC0[rip] vfmadd213ss xmm0, xmm1, DWORD PTR .LC1[rip] vmovss DWORD PTR [rsp+8], xmm1 call pure nothrow @nogc @trusted float std.math.rounding.floor(float) vmovss xmm1, DWORD PTR [rsp+8] mov edi, OFFSET FLAT:immutable(float[6]) example.expImpl!(float).expImpl(float).P vfnmadd231ss xmm1, xmm0, DWORD PTR .LC2[rip] vmovss DWORD PTR [rsp+12], xmm0 vfnmadd231ss xmm1, xmm0, DWORD PTR .LC3[rip] vmulss xmm3, xmm1, xmm1 vmovaps xmm0, xmm1 vmovss DWORD PTR [rsp+8], xmm1 vmovd ebx, xmm3 call pure nothrow @nogc @safe float std.math.algebraic.poly!(float, float, 6).poly(float, ref const(float[6])) vmovss xmm1, DWORD PTR [rsp+8] vmovd xmm4, ebx vmovss xmm2, DWORD PTR [rsp+12] vfmadd132ss xmm0, xmm1, xmm4 vaddss xmm0, xmm0, DWORD PTR .LC4[rip] add rsp, 16 pop rbx vcvttss2si edi, xmm2 jmp ldexpf float example.sRGB_case4(float): sub rsp, 8 call pure nothrow @nogc @safe float example.expImpl!(float).expImpl(float) vmovss xmm1, DWORD PTR .LC6[rip] vfmadd132ss xmm0, xmm1, DWORD PTR .LC5[rip] add rsp, 8 ret https://godbolt.org/z/YMoMPdjn5 Additionally std.math.exp itself, is never inlined by gcc. This is important, as some early checks (isNaN, OF, UF checks) in exp could be removed by proper inlining.
PS. LDC 1.23.0 - 1.32.0 produce optimal code. LDC 1.22.0 a bit worse (due to use of x87 codegen), and 1.21 and older fail to inline `ldexp`, but still inline `poly` and `floor` perfectly.
Interesting enough, GDC 10.2 does inline `poly` instantiation with all the constants.
Even for C/C++, GCC does not inline: float f(float a, int b) { return __builtin_ldexpf(a,b); } So ...
With -ffast-math -mfpmath=387,sse (or -mavx512f instead of -mfpmath=387 as there is a avx512f instruction too) added, ldexp gets inlined.
(In reply to Andrew Pinski from comment #4) > With -ffast-math -mfpmath=387,sse (or -mavx512f instead of -mfpmath=387 as > there is a avx512f instruction too) added, ldexp gets inlined. Note, vscaless accept a float32 operand for exp which is int32 in ldexpf, there maybe some precision loss to convert an int32 to float32, that's why Ofast is needed here.