这段文字基于这样一个前提条件:所讨论的平台遵循IEEE-754(2008)浮点数标准,在此标准下,float
类型被映射到IEEE-754的binary32
格式,并且32位整型数据与float
类型在字节顺序上采用相同的方式。同时,我们假定存在一个C语言工具链,可以通过设置适当的命令行选项(如必要)来维护IEEE-754语义。作者使用了Intel C/C++编译器,并带有-march=skylake-avx152 -O3 -fp-model=precise
这些选项。
鉴于互补误差函数(erfc(x))关于(0,1)对称,我们可以专注于函数在正半平面上的输入。在这一区间内,函数的衰减特性大致类似于exp(-x²),并且在使用float
类型进行计算时,对于大于10.5的x值,由于下溢会导致结果趋近于零。若在[0, 10.5]范围内绘制erfc(x) / exp(-x²)图形,可以看出这个表达式的形状较难直接用多项式来准确逼近,但非常适合用有理函数,即两个多项式的比值来进行逼近。初步试验表明,每个多项式均为三次的两个多项式应当足以实现五位有效数字的精度。
尽管市面上有很多工具可以生成多项式近似,但在生成有理函数近似方面却不太常见。为此,作者采用了Remez算法的一种改进版本,首先生成了erfc(x) / exp(-x²)的一个初步的最小化最大误差(minimax)逼近R(x) = P(x)/Q(x),然后又进行了较为广泛的经验性搜索,最终找到了一个能够提供接近等幅震荡误差峰且几乎达到10⁻⁵相对误差的逼近,而对于作者的需求来说,剩余的差异是可以忽略不计的。
通过计算erfc(x) = exp(-x²) * R(x),显然获得的精度取决于平台expf()
函数实现的准确性。根据作者的经验,该函数的忠实四舍五入实现(最大误差≤1 ulp)是比较普遍的。尽管Intel编译器附带的高性能数学库提供了接近正确四舍五入(最大误差非常接近0.5 ulps)的expf()
实现,但作者也尝试了自己的忠实四舍五入替代版本my_expf()
,即使其误差稍大一些,但对fast_erfcf()
函数的精度影响也非常微小。
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#define USE_FMA (1)
#define USE_BUILTIN_EXP (0)
#if !USE_BUILTIN_EXP
float my_expf (float a);
#endif // USE_BUILTIN_EXP
/* Fast computation of the complementary error function. For argument x > 0
erfc(x) = exp(-x*x) * P(x) / Q(x), where P(x) and Q(x) are polynomials.
If expf() is faithfully rounded, the following error bounds should hold:
Maximum relative error < 1.065e-5, maximum absolute error < 9.50e-6, and
maximum ulp error < 176.5
*/
float fast_erfcf (float x)
{
float a, c, e, p, q, r, s;
a = fabsf (x);
c = fminf (a, 10.5f);
s = -c * c;
#if USE_BUILTIN_EXP
e = expf (s);
#else // USE_BUILTIN_EXP
e = my_expf (s);
#endif // USE_BUILTIN_EXP
#if USE_FMA
q = 3.82346243e-1f; // 0x1.8785c6p-2
p = -4.38094139e-5f; // -0x1.6f8000p-15
q = fmaf (q, c, 1.30382288e+0f); // 0x1.4dc756p+0
p = fmaf (p, c, 2.16852024e-1f); // 0x1.bc1ceap-3
q = fmaf (q, c, 1.85278833e+0f); // 0x1.da5056p+0
p = fmaf (p, c, 7.23953605e-1f); // 0x1.72aa0cp-1
q = fmaf (q, c, 9.99991655e-1f); // 0x1.fffee8p-1
p = fmaf (p, c, 1.00000000e+0f); // 0x1.000000p+0
#else // USE_FMA
q = 3.82346272e-1f; // 0x1.8785c8p-2f
p = -4.38243151e-5f; // -0x1.6fa000p-15
q = q * c + 1.30382371e+0f; // 0x1.4dc764p+0
p = p * c + 2.16852218e-1f; // 0x1.bc1d04p-3
q = q * c + 1.85278797e+0f; // 0x1.da5050p+0
p = p * c + 7.23953605e-1f; // 0x1.72aa0cp-1
q = q * c + 9.99991596e-1f; // 0x1.fffee6p-1
p = p * c + 1.00000000e+0f; // 0x1.000000p+0
#endif // USE_FMA
r = e / q;
r = r * p;
if (x < 0.0f) r = 2.0f - r;
if (isnan(x)) r = x + x;
return r;
}
float uint32_as_float (uint32_t a)
{
float r;
memcpy (&r, &a, sizeof r);
return r;
}
/* Exponential function. Maximum error 0.86565 ulps */
float my_expf (float a)
{
float f, r, j, s, t;
int i;
unsigned int ia;
// exp(a) = 2**i * exp(f); i = rintf (a / log(2))
j = fmaf (1.442695f, a, 12582912.f); // 0x1.715476p0 // log2(e)
j = j - 12582912.f; // 0x1.8p23 // 2**23+2**22
f = fmaf (j, -6.93145752e-1f, a); // -0x1.62e400p-1 // log_2_hi
f = fmaf (j, -1.42860677e-6f, f); // -0x1.7f7d1cp-20 // log_2_lo
i = (int)j;
// approximate r = exp(f) on interval [-log(2)/2, +log(2)/2]
r = 1.37805939e-3f; // 0x1.694000p-10
r = fmaf (r, f, 8.37312452e-3f); // 0x1.125edcp-7
r = fmaf (r, f, 4.16695364e-2f); // 0x1.555b5ap-5
r = fmaf (r, f, 1.66664720e-1f); // 0x1.555450p-3
r = fmaf (r, f, 4.99999851e-1f); // 0x1.fffff6p-2
r = fmaf (r, f, 1.00000000e+0f); // 0x1.000000p+0
r = fmaf (r, f, 1.00000000e+0f); // 0x1.000000p+0
// exp(a) = 2**i * r
ia = (i > 0) ? 0 : 0x83000000;
s = uint32_as_float (0x7f000000 + ia);
t = uint32_as_float ((i << 23) - ia);
r = r * s;
r = r * t;
// handle special cases: severe overflow / underflow
if (fabsf (a) >= 104.0f) r = (a < 0) ? 0.0f : INFINITY;
return r;
}
uint32_t float_as_uint32 (float a)
{
uint32_t r;
memcpy (&r, &a, sizeof r);
return r;
}
uint64_t double_as_uint64 (double a)
{
uint64_t r;
memcpy (&r, &a, sizeof r);
return r;
}
double floatUlpErr (float res, double ref)
{
uint64_t refi, i, j, err;
int expoRef;
/* ulp error cannot be computed if either operand is NaN, infinity, zero */
if (isnan (res) || isnan (ref) || isinf(res) || isinf (ref) ||
(res == 0.0f) || (ref == 0.0)) {
return 0.0;
}
i = ((int64_t)float_as_uint32 (res)) << 32;
expoRef = (int)(((double_as_uint64 (ref) >> 52) & 0x7ff) - 1023);
refi = double_as_uint64 (ref);
if (expoRef >= 129) {
j = (refi & 0x8000000000000000ULL) | 0x7fffffffffffffffULL;
} else if (expoRef < -126) {
j = ((refi << 11) | 0x8000000000000000ULL) >> 8;
j = j >> (-(expoRef + 126));
j = j | (refi & 0x8000000000000000ULL);
} else {
j = ((refi << 11) & 0x7fffffffffffffffULL) >> 8;
j = j | ((uint64_t)(expoRef + 127) << 55);
j = j | (refi & 0x8000000000000000ULL);
}
err = (i < j) ? (j - i) : (i - j);
return err / 4294967296.0;
}
int main (void)
{
uint32_t argi, resi, refi, diff;
float arg, res, reff, abserrloc = NAN, relerrloc = NAN, ulperrloc = NAN;
double ref, relerr, abserr, ulperr;
double maxabserr = 0, maxrelerr = 0, maxulperr = 0;
argi = 0;
do {
arg = uint32_as_float (argi);
ref = erfc ((double)arg);
res = fast_erfcf (arg);
reff = (float)ref;
resi = float_as_uint32 (res);
refi = float_as_uint32 (reff);
ulperr = floatUlpErr (res, ref);
if (ulperr > maxulperr) {
ulperrloc = arg;
maxulperr = ulperr;
}
abserr = fabs (res - ref);
if (abserr > maxabserr) {
abserrloc = arg;
maxabserr = abserr;
}
if (fabs (ref) >= 0x1.0p-126) {
relerr = fabs ((res - ref) / ref);
if (relerr > maxrelerr) {
relerrloc = arg;
maxrelerr = relerr;
}
}
diff = (resi > refi) ? (resi - refi) : (refi - resi);
if (diff > 200) {
printf ("diff=%u @ %15.8e : res=% 15.8e ref=% 15.8e\n",
diff, arg, res, ref);
return EXIT_FAILURE;
}
argi++;
} while (argi);
printf ("max rel err = %.6e @ % 15.8e\n"
"max abs err = %.6e @ % 15.8e\n"
"max ulp err = %.6e @ % 15.8e\n",
maxrelerr, relerrloc,
maxabserr, abserrloc,
maxulperr, ulperrloc);
return EXIT_SUCCESS;
}