海波(a,b)倒数的高效精确计算



Givens旋转为实现QR分解提供了一种健壮且易于并行的方法。Givens旋转需要计算旋转角度的正弦和余弦分量。在实际计算的情况下,这通常涉及hypot()函数的倒数的计算,以规范化两个向量,例如维基百科中所示。

虽然这避免了中间计算中的大多数上溢和下溢情况,但对于非常大的值abhypot(a,b)可能上溢到无穷大,而1/√(a2+b2(实际上可以表示为亚正规浮点数。此外,除法的使用增加了进一步的计算成本,这在具有慢速浮点除法的平台上可能是显著的。

因此,以类似于标准hypot()函数的成本直接计算1/√(A2+b2(的函数rhypot(a,b)是可取的。准确度应与计算CCD_ 7的朴素方法相同或更好。使用正确取整的hypot函数,该表达式的最大误差为1.5 ulps。

如何才能高效、准确地实现这样的功能?可以假设IEEE-754二进制浮点运算的使用以及对融合乘加(FMA(运算的本地硬件支持的可用性。为了便于说明和测试,我们可以限制为单精度计算,即IEEE-754binary32格式。

在下面,我将展示ISO-C99代码,该代码以良好的精度和性能实现rhypot。通用算法是直接从我在这个答案中展示的hypot的示例实现中导出的。对于hypot,确定自变量中幅度最大的值,然后找到将该值映射到单位附近的比例因子(出于准确性的原因,为2的幂(。将比例因子应用于两个自变量,然后用sqrt函数计算该变换后的2向量的长度,最后用";比例因子的倒数。缩放依赖于实际的乘法,因为自变量可能是无法单独通过简单的指数操作正确缩放的子规范。

对于rhypot,只需要两个更改:必须使用倒数平方根函数rsqrt而不是sqrt,并且输入缩放和结果缩放使用相同的缩放因子。

一些计算环境提供rsqrt()功能,该功能计划包含在未来版本的ISO C标准(ISO/IEC TS 18661-4:2015(中。对于不提供倒数平方根函数的环境,我将展示一些可移植的(在问题中所述的平台要求内(和机器特定的实现。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
uint32_t __float_as_uint32 (float a)
{
uint32_t r;
memcpy (&r, &a, sizeof r);
return r;
}
float __uint32_as_float (uint32_t a)
{
float r;
memcpy (&r, &a, sizeof r);
return r;
}
float my_rsqrtf (float);
/* Compute the reciprocal of sqrt (a**2 + b**2), avoiding premature overflow
and underflow in intermediate computation. The accuracy of this function
depends on the accuracy of the reciprocal square root implementation used. 
With the rsqrtf() implementations shown below, the following maximum ulp 
error was observed for 2**36 random test cases:
CORRECTLY_ROUNDED       1.20736973
SSE_HALLEY              1.33120522 
SSE_2NR                 1.42086841
SQRT_OOX                1.42906701
BIT_TWIDDLE_3NR         1.43062950
ITO_TAKAGI_YAJIMA_1NR   1.43681737
BIT_TWIDDLE_NR_HALLEY   1.47485797
*/
float my_rhypotf (float a, float b)
{
float fa, fb, mn, mx, scale, s, w, res;
uint32_t expo;
/* sort arguments by magnitude */
fa = fabsf (a);
fb = fabsf (b);
mx = fmaxf (fa, fb);
mn = fminf (fa, fb);
/* compute scale factor */
expo = __float_as_uint32 (mx) & 0xfc000000;
scale = __uint32_as_float (0x7e000000 - expo);
/* scale operand of maximum magnitude towards unity */
mn = mn * scale;
mx = mx * scale;
/* mx in [2**-23, 2**6) */
s = fmaf (mx, mx, mn * mn); // 0.75 ulp
w = my_rsqrtf (s);
/* reverse previous scaling */
res = w * scale;
/* handle special cases */
float t = a + b;
if (!(fabsf (t) <= INFINITY)) res = t; // isnan(t)
if (mx == INFINITY) res = 0.0f; // isinf(mx)
return res;
}
#define CORRECTLY_ROUNDED     (1)
#define SSE_HALLEY            (2)
#define SSE_2NR               (3)
#define ITO_TAKAGI_YAJIMA_1NR (4)
#define SQRT_OOX              (5)
#define BIT_TWIDDLE_3NR       (6)
#define BIT_TWIDDLE_NR_HALLEY (7)
#define RSQRT_VARIANT (SSE_HALLEY)
#if (RSQRT_VARIANT == SSE_2NR) || (RSQRT_VARIANT == SSE_HALLEY)
#include "immintrin.h"
#endif // (RSQRT_VARIANT == SSE_2NR) || (RSQRT_VARIANT == SSE_HALLEY)
float my_rsqrtf (float a)
{
#if RSQRT_VARIANT == CORRECTLY_ROUNDED
float r = (float) sqrt (1.0/(double)a);
#elif RSQRT_VARIANT == SQRT_OOX
float r = sqrtf (1.0f / a);
#elif RSQRT_VARIANT == SSE_2NR
float r;
/* compute initial approximation */
_mm_store_ss (&r, _mm_rsqrt_ss (_mm_set_ss (a)));
/* refine approximation using two Newton-Raphson iterations */
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
#elif RSQRT_VARIANT == SSE_HALLEY
float e, r;
/* compute initial approximation */
_mm_store_ss (&r, _mm_rsqrt_ss (_mm_set_ss (a)));
/* refine approximation using Halley iteration with cubic convergence */
e = fmaf (r * r, -a, 1.0f);
r = fmaf (fmaf (0.375f, e, 0.5f), e * r, r);
#elif RSQRT_VARIANT == BIT_TWIDDLE_3NR
float r;
/* compute initial approximation */
r = __uint32_as_float (0x5f375b0d - (__float_as_uint32(a) >> 1));
/* refine approximation using three Newton-Raphson iterations */
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
#elif RSQRT_VARIANT == BIT_TWIDDLE_NR_HALLEY
float e, r;
/* compute initial approximation */
r = __uint32_as_float (0x5f375b0d - (__float_as_uint32(a) >> 1));
/* refine approximation using Newton-Raphson iteration */
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
/* refine approximation using Halley iteration with cubic convergence */
e = fmaf (r * r, -a, 1.0f);
r = fmaf (fmaf (0.375f, e, 0.5f), e * r, r);
#elif RSQRT_VARIANT == ITO_TAKAGI_YAJIMA_1NR
/* Masayuki Ito, Naofumi Takagi, Shuzo Yajima, "Efficient Initial 
Approximation for Multiplicative Division and Square Root by a 
Multiplication with Operand Modification". IEEE Transactions on 
Computers, Vol. 46, No. 4, April 1997, pp. 495-498.
*/
#define TAB_INDEX_BITS     (7)
#define TAB_ENTRY_BITS     (16)
#define TAB_ENTRIES        (1 << TAB_INDEX_BITS)
#define FP32_EXPO_BIAS     (127)
#define FP32_MANT_BITS     (23)
#define FP32_SIGN_MASK     (0x80000000)
#define FP32_EXPO_MASK     (0x7f800000)
#define FP32_EXPO_LSB_MASK (1u << FP32_MANT_BITS)
#define FP32_INDEX_MASK    (((1u << TAB_INDEX_BITS) - 1) << (FP32_MANT_BITS - TAB_INDEX_BITS))
#define FP32_XHAT_MASK     (~(FP32_INDEX_MASK | FP32_SIGN_MASK) | FP32_EXPO_MASK)
#define FP32_FLIP_BIT_MASK (3u << (FP32_MANT_BITS - TAB_INDEX_BITS - 1))
#define FP32_ONE_HALF      (0x3f000000)
const uint16_t d1tab [TAB_ENTRIES] = {
0xb2ec, 0xaed7, 0xaae9, 0xa720, 0xa37b, 0x9ff7, 0x9c93, 0x994d,
0x9623, 0x9316, 0x9022, 0x8d47, 0x8a85, 0x87d8, 0x8542, 0x82c0,
0x8053, 0x7bf0, 0x775f, 0x72f1, 0x6ea4, 0x6a77, 0x666a, 0x6279,
0x5ea5, 0x5aed, 0x574e, 0x53c9, 0x505d, 0x4d07, 0x49c8, 0x469e,
0x438a, 0x408a, 0x3d9e, 0x3ac4, 0x37fc, 0x3546, 0x32a0, 0x300b,
0x2d86, 0x2b10, 0x28a8, 0x264f, 0x2404, 0x21c6, 0x1f95, 0x1d70,
0x1b58, 0x194c, 0x174b, 0x1555, 0x136a, 0x1189, 0x0fb2, 0x0de6,
0x0c22, 0x0a68, 0x08b7, 0x070f, 0x056f, 0x03d8, 0x0249, 0x00c1,
0xfd08, 0xf742, 0xf1b4, 0xec5a, 0xe732, 0xe239, 0xdd6d, 0xd8cc,
0xd454, 0xd002, 0xcbd6, 0xc7cd, 0xc3e5, 0xc01d, 0xbc75, 0xb8e9,
0xb57a, 0xb225, 0xaeeb, 0xabc9, 0xa8be, 0xa5cb, 0xa2ed, 0xa024,
0x9d6f, 0x9ace, 0x983e, 0x95c1, 0x9355, 0x90fa, 0x8eae, 0x8c72,
0x8a45, 0x8825, 0x8614, 0x8410, 0x8219, 0x802e, 0x7c9c, 0x78f5,
0x7565, 0x71eb, 0x6e85, 0x6b31, 0x67f3, 0x64c7, 0x61ae, 0x5ea7,
0x5bb0, 0x58cb, 0x55f6, 0x5330, 0x5079, 0x4dd1, 0x4b38, 0x48ad,
0x462f, 0x43be, 0x4159, 0x3f01, 0x3cb5, 0x3a75, 0x3840, 0x3616
};
uint32_t arg, idx, d1, xhat;
float r;
arg = __float_as_uint32 (a);
idx = (arg >> ((FP32_MANT_BITS + 1) - TAB_INDEX_BITS)) & ((1u << TAB_INDEX_BITS) - 1); 
d1 = FP32_ONE_HALF | (d1tab[idx] << ((FP32_MANT_BITS + 1) - TAB_ENTRY_BITS));
xhat = ((arg & FP32_INDEX_MASK) | (((((3 * FP32_EXPO_BIAS) << FP32_MANT_BITS) + ~arg) >> 1) & FP32_XHAT_MASK)) ^ FP32_FLIP_BIT_MASK;
/* compute initial approximation, accurate to about 14 bits */
r = __uint32_as_float (d1) * __uint32_as_float (xhat);
/* refine approximation with one Newton-Raphson iteration */
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
#else
#error unsupported RSQRT_VARIANT
#endif // RSQRT_VARIANT
return r;
}
uint64_t __double_as_uint64 (double a)
{
uint64_t r;
memcpy (&r, &a, sizeof r);
return r;
}
double floatUlpErr (float res, double ref)
{
uint64_t i, j, err, refi;
int expoRef;

/* ulp error cannot be computed if either operand is NaN, infinity, zero */
if (isnan (res) || isnan (ref) || isinf (res) || isinf (ref) ||
(res == 0.0f) || (ref == 0.0f)) {
return 0.0;
}
/* Convert the float result to an "extended float". This is like a float
with 56 instead of 24 effective mantissa bits.
*/
i = ((uint64_t)__float_as_uint32(res)) << 32;
/* Convert the double reference to an "extended float". If the reference is
>= 2^129, we need to clamp to the maximum "extended float". If reference
is < 2^-126, we need to denormalize because of the float types's limited
exponent range.
*/
refi = __double_as_uint64(ref);
expoRef = (int)(((refi >> 52) & 0x7ff) - 1023);
if (expoRef >= 129) {
j = 0x7fffffffffffffffULL;
} else if (expoRef < -126) {
j = ((refi << 11) | 0x8000000000000000ULL) >> 8;
j = j >> (-(expoRef + 126));
} else {
j = ((refi << 11) & 0x7fffffffffffffffULL) >> 8;
j = j | ((uint64_t)(expoRef + 127) << 55);
}
j = j | (refi & 0x8000000000000000ULL);
err = (i < j) ? (j - i) : (i - j);
return err / 4294967296.0;
}
double rhypot (double a, double b)
{
return 1.0 / hypot (a, b);
}
// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC  ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579)                     /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)
#define FP32_QNAN_BIT (0x00400000)
int main (void)
{
float af, bf, resf, reff;
uint32_t ai, bi, resi, refi;
double ref, err, maxerr = 0;
uint64_t diff, diffsum = 0, count = 1ULL << 36;

do {
ai = KISS;
bi = KISS;
af = __uint32_as_float (ai);
bf = __uint32_as_float (bi);
resf = my_rhypotf (af, bf);
ref = rhypot ((double)af, (double)bf);
reff = (float)ref;
refi = __float_as_uint32 (reff);
resi = __float_as_uint32 (resf);
diff = llabs ((long long int)resi - (long long int)refi);
/* If both inputs are a NaN, result can be either argument, converted
to QNaN if necessary. If one input is NaN and the other not infinity
the NaN input must be returned, converted to QNaN if necessary. If
one input is infinity, zero must be returned even if the other input
is a NaN. In all other cases allow up to 1 ulp of difference.
*/
if ((isnan (af) && isnan (bf) && (resi != (ai | FP32_QNAN_BIT)) && (resi != (bi | FP32_QNAN_BIT))) ||
(isnan (af) && !isinf (bf) && !isnan (bf) && (resi != (ai | FP32_QNAN_BIT))) ||
(isnan (bf) && !isinf (af) && !isnan (af) && (resi != (bi | FP32_QNAN_BIT))) ||
(isinf (af) && (resi != 0)) ||
(isinf (bf) && (resi != 0)) ||
(diff > 1)) {
printf ("err @ (%08x,%08x): res= %08x (%15.8e) ref=%08x (%15.8e)n",
ai, bi, resi, resf, refi, reff);
return EXIT_FAILURE;
}
diffsum += diff;
err = floatUlpErr (resf, ref);
if (err > maxerr) {
printf ("ulp=%.8f @ (% 15.8e, % 15.8e): res=%15.6a  ref=%22.13an", 
err, af, bf, resf, ref);
maxerr = err;
}
count--;
} while (count);
printf ("diffsum = %llun", diffsum);
return EXIT_SUCCESS;
}

最新更新