c-随机正态分布生成器比较



因此,我为一个依赖于生成正态分布随机数的项目编写了几个随机数生成器。我已经写了几个不同的生成器实现,即Box-Muller方法、Marsaglia的Polar方法和逆累积分布近似方法。我比较了它们的速度,结果发现逆方法是三种方法中最快的,这是意料之中的吗,还是我在写另外两种时搞砸了?我知道numpy使用Polar方法已经很长时间了,所以我相信它应该是三种方法中最快的?

我使用 gcc 9.3.0 进行编译,并开启了 -O3 优化标志。

以下是生成器的代码:

/* Shared state for the pair-producing generators (polar and Box-Muller):
   both methods generate two independent variates per iteration, so the
   spare one is cached here and handed out on the next call. */
struct gaussGenState 
{
float gauss;   /* cached spare variate — meaningful only while has_gauss != 0 */
int has_gauss; /* 1 if a cached variate is waiting, 0 otherwise */
};
/* Reset a generator state: clear the cached-variate flag and zero the cache
   so the first call to gauss()/gaussbm() draws a fresh pair. */
void initializeGauss(struct gaussGenState *state)
{
    state->gauss = 0.0;
    state->has_gauss = 0;
}
/*
 * Marsaglia's polar method for standard normally distributed variates.
 *
 * Each pass of the rejection loop yields two independent variates: one is
 * returned immediately and the spare is cached in *state, so every second
 * call is nearly free.
 *
 * Seeding of rand() must be done outside this function (with srand()).
 */
float gauss(struct gaussGenState *state)
{
    /* Hand out the cached spare variate if one is waiting. */
    if (state->has_gauss) {
        const float temp = state->gauss;
        state->has_gauss = 0;
        state->gauss = 0.0f;
        return temp;
    }

    float f, x1, x2, r2;
    do {
        /* Two uniforms on [-1, 1]; resample until the point lands strictly
           inside the unit disc and away from the origin (log(0) blows up). */
        x1 = ((float)rand() / RAND_MAX) * 2.0f - 1.0f;
        x2 = ((float)rand() / RAND_MAX) * 2.0f - 1.0f;
        r2 = x1 * x1 + x2 * x2;
    } while (r2 >= 1.0f || r2 == 0.0f);

    /* Use the single-precision libm routines: the original called the
       double-precision sqrt()/log(), forcing float->double->float
       conversions and slower library paths for no accuracy benefit in a
       float-returning interface. */
    f = sqrtf(-2.0f * logf(r2) / r2);
    state->gauss = f * x1;  /* cache the spare variate */
    state->has_gauss = 1;
    return f * x2;
}
/*
 * Box-Muller method for standard normally distributed variates.
 *
 * Each draw of (u, v) yields two independent variates: the sine branch is
 * returned immediately and the cosine branch is cached in *state for the
 * next call.
 *
 * Seeding of rand() must be done outside this function (with srand()).
 */
float gaussbm(struct gaussGenState *state)
{
    /* Hand out the cached spare variate if one is waiting. */
    if (state->has_gauss) {
        const float temp = state->gauss;
        state->has_gauss = 0;
        state->gauss = 0.0f;
        return temp;
    }

    float u, v, f1, f2;
    /* u must lie in (0, 1]: rand() can return 0, and the original
       rand()/RAND_MAX mapping then fed log(0) = -inf into sqrtf, producing
       inf/NaN output.  Shifting by +1 over RAND_MAX+1 maps
       rand() in [0, RAND_MAX] onto (0, 1]. */
    u = ((float)rand() + 1.0f) / ((float)RAND_MAX + 1.0f);
    v = (float)rand() / RAND_MAX;

    /* Single-precision libm (sqrtf/logf/cosf/sinf) — the double-precision
       calls of the original add conversion overhead with no benefit for a
       float result. */
    f1 = sqrtf(-2.0f * logf(u));
    f2 = 2.0f * (float)M_PI * v;
    state->gauss = f1 * cosf(f2);  /* cache the spare variate */
    state->has_gauss = 1;
    return f1 * sinf(f2);
}
float gaussInv(void)
{
/*
Implementation of Inverse cumulative distribution method for calculating normally 
distributed gaussian variables
Approximation relative error less than 1.15 x 10e-9 in the entire region.
seeding of rand needs to be done outside of function (with srand())
*/
float p, q, r;
float a[6] = {-3.969683028665376e+01,  2.209460984245205e+02,
-2.759285104469687e+02,  1.383577518672690e+02,
-3.066479806614716e+01,  2.506628277459239e+00};
float b[5] = {-5.447609879822406e+01,  1.615858368580409e+02,
-1.556989798598866e+02,  6.680131188771972e+01,
-1.328068155288572e+01};
float c[6] = {-7.784894002430293e-03, -3.223964580411365e-01,
-2.400758277161838e+00, -2.549732539343734e+00,
4.374664141464968e+00,  2.938163982698783e+00};
float d[4] = { 7.784695709041462e-03,  3.224671290700398e-01,
2.445134137142996e+00,  3.754408661907416e+00};
p = ((float)rand()/RAND_MAX);
if (p < 0.02425){
q = sqrt(-2*log(p));
return ((((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) /
((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1));
}
if ((1-0.02425) < p){
q = sqrt(-2*log(1-p));
return -((((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / 
((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1));
}
q = p - 0.5;
r = q*q;
return ((((((a[0]*r+a[1])*r+a[2])*r+a[3])*r+a[4])*r+a[5])*q /
(((((b[0]*r+b[1])*r+b[2])*r+b[3])*r+b[4])*r+1));
}

首先,rand 的实现通常非常缓慢,再加上除以 RAND_MAX 的慢速除法和到 float 的强制转换,更是雪上加霜。因此,gauss 和 gaussbm 实现(每次至少调用 rand 两次)实际上比 gaussInv(只调用 rand 一次)慢,这并不奇怪。

此外,gauss 之所以慢,也是因为 while 拒绝采样循环的分支可预测性较差(处理器在可预测的循环和条件分支上通常更快);而 gaussbm 之所以慢,还因为调用了昂贵的 cos/sin 三角函数。

虽然 gaussInv 理应比另外两个更快,但它仍然可以改进。一种方法是使用矢量化和自定义优化的随机数函数。事实上,借助 SIMD 指令,大多数处理器一次可以处理多个浮点数,而这个函数可以利用它们(尽管这并不简单)。大多数主流 x86-64 处理器可以一次处理 8 个浮点数(使用 AVX/AVX2),少数较新的处理器甚至可以一次计算 16 个浮点数(使用 AVX-512)。注意,gaussbm 也可以被矢量化。

对于标量实现,查找表可用于加速计算,尽管最快标量实现的吞吐量可能明显低于现代主流处理器上任何优化的矢量化实现的吞吐量。

最新更新