在学习cuda时,我正在研究一个微型项目来计算移动平均值。虽然我简单的移动平均值(SMA(效果很好(尽管缓慢且不优化(,但我的指数移动平均值(EMA(总是会导致数字不正确。
我已经发现问题是*(ema + i - 1)
始终是0。相同的数组访问概念在测试C 文件中很好地工作得很好,但在我的CUDA应用程序中却没有。我认为我只是不知道有关指针或cuda的概念。
using namespace std;
// simple_ma not included
void __global__ exponential_ma(int n, int period, float *data, float *ema){
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i == 0){
*ema = *data;
}else if(i < n){
float k = 2.0f/(period+1);
*(ema + i) = *(data + i)*k + *(ema + i - 1) * (1.0f-k);
// PROBLEM OCCURS ON THE LINE ABOVE, neither does ema[i-1] work
}
}
int main(){
/**
* Function that computes a moving average on a vector
*/
int N = 1<<5; // data size
cout << "N = " << N << " bytes = " << N*sizeof(float) << endl;
int period = 10; // moving average period
// malloc'ed for stack usage instead of small heap size
float *data = (float*)malloc(N*sizeof(float));
float *sma = (float*)malloc(N*sizeof(float));
float *ema = (float*)malloc(N*sizeof(float));
float *d_data; // device pointer for data
float *d_sma; // device pointer for simple moving average
float *d_ema; // device pointer for exponential moving average
// CUDA allocate memory for data, SMA, and EMA
cudaMalloc(&d_data, N*sizeof(float));
cudaMalloc(&d_sma, N*sizeof(float));
cudaMalloc(&d_ema, N*sizeof(float));
// initialize data
srand(time(0));
data[0] = rand() % 100 + 50;
for(int i = 1; i < N; i++){
data[i] = data[i-1] + rand() % 11 - 5;
}
// copy data from host to device
cudaMemcpy(d_data, data, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_sma, sma, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_ema, ema, N*sizeof(float), cudaMemcpyHostToDevice);
// call device function
simple_ma<<<(N+255)/256, 256>>>(N, period, d_data, d_sma);
exponential_ma<<<(N+255)/256, 256>>>(N, period, d_data, d_ema);
cudaMemcpy(sma, d_sma, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(ema, d_ema, N*sizeof(float), cudaMemcpyDeviceToHost);
for(int i = 0; i < N; i += 1){
cout << "i = " << i << " data = "<< data[i] << " ---sma---> " << sma[i] << " ---ema---> " << ema[i] << endl;
}
cudaFree(d_data);
cudaFree(d_sma);
cudaFree(d_ema);
return 0;
}
cuda中的线程可以按任何顺序执行。ema[i-1]
的计算可能不会是在您尝试在另一个线程中计算ema[i]
的时候(这取决于ema[i-1]
的计算已完成(。您用于简单串行实现此算法的方法将无法以线程并行的方式
考虑到这一点,这是一种可能的方法。
首先,重铸您的递归EMA计算:
ema[0] = data[0]
i>0: ema[i] = k*data[i]+(1-k)*ema[i-1]
以非恢复形式:
ema[0] = data[0]
i>0: ema[i] = ((1-k)^i)*data[0] + ∑(((1-k)^x)*k*data[i-x])
x=0..i-1
这将通知我们如何编写CUDA内核代码。如果这种转换对您来说似乎是晦涩的,那么您可能希望创建一个序列的前几个条目的表,类似于此答案中描述的方法。
它有效,但是每个线程都在整个输入数组中迭代到其索引。将有一个螺纹块(具有最高数组索引(需要比所有其他线程更长的时间。最坏的情况线程与串行版本大致相同的工作,因此不是一个非常有趣的并行实现。
为了解决这个问题,我们可以观察到非恢复形式方程。根据您的代码,术语(1.0 - k)
始终小于1,因为k
的2除以一些大于2的正整数(即,我们假设period
为2或更大(。因此,随着总和的进行,(1.0 - k)^x
术语最终变得很小。我们还将假设您的数据在范围内大致按照您的表现。在这种情况下,随着求和的进行,最终求和的条款对float
总数没有明显影响。因此,当我们的(1.0 - k)^x
项变得足够小,无法对结果产生重大影响时,我们将减少循环处理。
使用这些假设和修改,我们可以创建一个比Naive Serial CPU版本更快的CUDA代码,同时保持较小的错误余量。
$ cat t1444.cu
#include <iostream>
#include <cstdio>
#define cudaCheckErrors(msg)
do {
cudaError_t __err = cudaGetLastError();
if (__err != cudaSuccess) {
fprintf(stderr, "Fatal error: %s (%s at %s:%d)n",
msg, cudaGetErrorString(__err),
__FILE__, __LINE__);
fprintf(stderr, "*** FAILED - ABORTINGn");
exit(1);
}
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void gpu_ema(const int n, const float k, const float * __restrict__ data, float * __restrict__ ema, const float tol){
int i = blockIdx.x*blockDim.x+threadIdx.x;
if (i == 0) ema[0] = data[0];
else if (i < n){
float sum = 0;
float fac = 1.0f - k;
float m = 1.0f;
int j;
for (j = 0; j < i; j++){
sum += m*k*data[i-j];
m *= fac;
if (m < tol) break; // early exit validity depends on a variety of assumptions
}
if (j == i) sum += m*data[0];
ema[i] = sum;
}
}
void cpu_ema(int n, int period, float *data, float *ema){
ema[0] = data[0];
float k = 2.0f/(period+1);
for (int i = 1; i < n; i++)
ema[i] = data[i]*k + ema[i-1]*(1.0f-k);
}
int main(){
/**
* Function that computes a moving average on a vector
*/
int N = 1<<20; // data size
std::cout << "N = " << N << " bytes = " << N*sizeof(float) << std::endl;
int period = 10; // moving average period
// malloc'ed for stack usage instead of small heap size
float *data = (float*)malloc(N*sizeof(float));
float *ema = (float*)malloc(N*sizeof(float));
float *gema = (float*)malloc(N*sizeof(float));
float *d_data; // device pointer for data
float *d_ema; // device pointer for exponential moving average
// CUDA allocate memory for data, SMA, and EMA
cudaMalloc(&d_data, N*sizeof(float));
cudaMalloc(&d_ema, N*sizeof(float));
// initialize data
srand(time(0));
data[0] = rand() % 100 + 50;
for(int i = 1; i < N; i++){
data[i] = data[i-1] + rand() % 11 - 5;
}
// copy data from host to device
cudaMemcpy(d_data, data, N*sizeof(float), cudaMemcpyHostToDevice);
// call device function
long long gpu_t = dtime_usec(0);
gpu_ema<<<(N+255)/256, 256>>>(N, 2.0f/(period+1), d_data, d_ema, 1e-7);
cudaDeviceSynchronize();
gpu_t = dtime_usec(gpu_t);
long long cpu_t = dtime_usec(0);
cpu_ema(N, period, data, ema);
cpu_t = dtime_usec(cpu_t);
if (N < 33)
for (int i = 0; i < N; i++)
std::cout << ema[i] << ",";
std::cout << std::endl;
cudaMemcpy(gema, d_ema, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("some CUDA error");
if (N < 33)
for(int i = 0; i < N; i += 1)
std::cout << gema[i] << ",";
std::cout << std::endl;
float max_err = fabs(gema[0] - ema[0])/ema[0];
for (int i = 1; i < N; i++)
max_err = max(max_err, fabs(gema[i] - ema[i])/ema[0]);
std::cout << "max err: " << max_err*100.0 << "% final gpu: " << gema[N-1] << " final cpu: " << ema[N-1] << std::endl;
std::cout << "cpu time: " << cpu_t/(float)USECPSEC << "s gpu time: " << gpu_t/(float)USECPSEC << "s" << std::endl;
cudaFree(d_data);
cudaFree(d_ema);
return 0;
}
$ nvcc -o t1444 t1444.cu
$ ./t1444
N = 1048576 bytes = 4194304
max err: 0.00218633% final gpu: 1311.38 final cpu: 1311.38
cpu time: 0.006346s gpu time: 0.000214s
$
特斯拉V100,CUDA 10.1
重复,上述代码具有增强性能的早期外观的有效性取决于范围限制的输入数据。我不会尝试仔细介绍统计数据,但是如果您不了解输入数据的统计信息,那么上述方法可能无效。