FFT in CUDA, wrong result for input 4096



I have implemented a 1D FFT in CUDA. Here is the code:

// DIT FFT algorithm
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <time.h>
#define PI 3.141592653
struct complex
{
    float real, imag;
};

// fill the input with a real ramp: x[i] = i + 0j
void fill(complex *x, int length)
{
    for (int i = 0; i < length; i++)
    {
        x[i].real = i;
        x[i].imag = 0;
    }
}

// reverse the lowest log2(N) bits of x
__device__ int bit_reversal(int x, float N)
{
    float log2N = log2f(N);
    int n = 0;
    for (int i = 0; i < log2N; i++)
    {
        n <<= 1;
        n |= (x & 1);
        x >>= 1;
    }
    return n;
}
__global__ void fft(complex *x, complex *rev_x, int N)
{
    int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
    int butterfly_width, step;
    struct complex wn, temp1, temp2;
    float stages = log2f(N);

    // bit reversal
    int r = bit_reversal(thread_id, N);
    rev_x[thread_id] = x[r];
    __syncthreads();

    // constant expression, used in the twiddle factor
    const double twoPIdivN = 2 * PI / N;
    if (N > 1)
    {
        for (int stage = 1; stage <= stages; stage++)
        {
            step = 1 << stage;
            butterfly_width = step >> 1;
            int pos = thread_id / butterfly_width * step;
            //printf("pos=%d ", pos);
            int j = thread_id % butterfly_width;
            int res = pos + j;
            if (res < N)
            {
                //printf("thread: %d, pos: %d, j: %d, res: %d ", thread_id, pos, j, res);
                // Wn = e^(-j*2*PI/N), converted with Euler's formula (real and imaginary parts)
                wn.real = cos(twoPIdivN * j * N / step);
                wn.imag = -sin(twoPIdivN * j * N / step);
                temp1.real = rev_x[res].real;
                temp1.imag = rev_x[res].imag;
                temp2.real = rev_x[res + butterfly_width].real * wn.real - rev_x[res + butterfly_width].imag * wn.imag;
                temp2.imag = rev_x[res + butterfly_width].imag * wn.real + rev_x[res + butterfly_width].real * wn.imag;
                rev_x[res].real = temp1.real + temp2.real;
                rev_x[res].imag = temp1.imag + temp2.imag;
                rev_x[res + butterfly_width].real = temp1.real - temp2.real;
                rev_x[res + butterfly_width].imag = temp1.imag - temp2.imag;
                //printf("(%1.2f %1.2fj) ", rev_x[thread_id].real, rev_x[thread_id].imag);
                //printf("(%1.2f %1.2fj) ", rev_x[thread_id + butterfly_width].real, rev_x[thread_id + butterfly_width].imag);
                //printf("(%d %d)", res, res + butterfly_width);
                __syncthreads();
                //printf("\n");
            }
        }
    }
}
int main()
{
    int N = 4096;       // input number of elements
    int threads = 1024;
    int blocks = (N / threads == 0) ? 1 : N / threads;

    struct complex *input = (struct complex *)malloc(N * sizeof(struct complex));
    // struct complex *rev_input = (struct complex *)malloc(N * sizeof(struct complex));

    fill(input, N);
    struct complex *dev_input;
    struct complex *dev_rev_input;
    size_t size = N * sizeof(struct complex);
    cudaMalloc((void **)&dev_input, size);
    cudaMalloc((void **)&dev_rev_input, size);
    cudaMemcpy(dev_input, input, size, cudaMemcpyHostToDevice);

    fft<<<blocks, threads>>>(dev_input, dev_rev_input, N);

    cudaMemcpy(input, dev_rev_input, size, cudaMemcpyDeviceToHost);

    // result of the fft
    // printf("\nResult of fft:\n");
    // for (int i = 0; i <= 1; i++)
    // {
    //     printf(" %d- (%1.2f , %1.2fj)\n", i, input[i].real, input[i].imag);
    //     printf(" %d- (%1.2f , %1.2fj)\n", N - i - 1, input[N - 1 - i].real, input[N - 1 - i].imag);
    // }

    free(input);
    cudaFree(dev_input);
    return 0;
}

With printf I took some samples of the output and verified them with MATLAB. My input sequence has length 4096. When the number of threads is 1024 (my GPU supports 1024 threads/block), the algorithm works well. With 512 threads there are fewer wrong results. Where is the problem? Does anyone know?
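
For this kind of check, a straightforward O(N²) DFT on the host can also serve as a reference instead of MATLAB. The helper below (reference_dft) is not part of the original code, only a sketch; it reuses the struct complex and PI from the listing above:

// Hypothetical host-side reference: direct O(N^2) evaluation of
// X[k] = sum_n x[n] * e^(-j*2*PI*k*n/N), for comparing against the GPU result.
void reference_dft(const struct complex *in, struct complex *out, int N)
{
    for (int k = 0; k < N; k++)
    {
        double re = 0.0, im = 0.0;
        for (int n = 0; n < N; n++)
        {
            double angle = -2.0 * PI * k * n / N;
            re += in[n].real * cos(angle) - in[n].imag * sin(angle);
            im += in[n].real * sin(angle) + in[n].imag * cos(angle);
        }
        out[k].real = (float)re;
        out[k].imag = (float)im;
    }
}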

EDIT: I moved the loop outside the kernel. The code is now:

__global__ void fft(complex *x, complex *rev_x, int N, int stage, int butterfly_width, int step)
{
    struct complex wn, temp1, temp2;
    // constant expression, used in the twiddle factor
    const double twoPIdivN = 2 * PI / N;
    int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
    if (stage == 1)
    {
        int r = bit_reversal(thread_id, N);
        rev_x[thread_id] = x[r];
        __syncthreads();
    }
    int pos = thread_id / butterfly_width * step;
    int j = thread_id % butterfly_width;
    int res = pos + j;
    if (res < N)
    {
        // Wn = e^(-j*2*PI/N), converted with Euler's formula (real and imaginary parts)
        wn.real = cos(twoPIdivN * j * N / step);
        wn.imag = -sin(twoPIdivN * j * N / step);
        temp1.real = rev_x[res].real;
        temp1.imag = rev_x[res].imag;
        temp2.real = rev_x[res + butterfly_width].real * wn.real - rev_x[res + butterfly_width].imag * wn.imag;
        temp2.imag = rev_x[res + butterfly_width].imag * wn.real + rev_x[res + butterfly_width].real * wn.imag;
        rev_x[res].real = temp1.real + temp2.real;
        rev_x[res].imag = temp1.imag + temp2.imag;
        rev_x[res + butterfly_width].real = temp1.real - temp2.real;
        rev_x[res + butterfly_width].imag = temp1.imag - temp2.imag;

        __syncthreads();
    }
}
void fft_caller(complex *x, complex *rev_x, int N)
{
    dim3 threads(BLOCK_SIZE);
    dim3 blocks(GRID_SIZE);
    // int threads = 1024;
    // int blocks = 1;
    float stages = log2f(N);
    int butterfly_width, step;
    if (N > 1 && is_power_of_two(N))
    {
        for (int stage = 1; stage <= stages; stage++)
        {
            printf("%d ", stage);
            step = 1 << stage;
            butterfly_width = step >> 1;
            fft<<<blocks, threads>>>(x, rev_x, N, stage, butterfly_width, step);
        }
    }
}
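
BLOCK_SIZE, GRID_SIZE and is_power_of_two are not shown in this edit; presumably they are defined along these lines (the exact values are an assumption, matching N = 4096 with 1024 threads per block):

// Assumed definitions, not shown in the original edit:
#define BLOCK_SIZE 1024              // threads per block
#define GRID_SIZE  4                 // hypothetical: 4096 / 1024; must scale with N

int is_power_of_two(int n)           // true if n is a positive power of two
{
    return n > 0 && (n & (n - 1)) == 0;
}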

Now the algorithm works fine up to an input length of 16384 (2^14). For larger inputs it produces wrong results. What is going on?

EDIT

I split the single kernel into three kernels. Here is the code:

__global__ void preproccess(complex *x, complex *rev_x, int N, int stage)
{
    int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
    int r = bit_reversal(thread_id, N);
}

__global__ void compute_temp(complex *rev_x, int N, int stage, int butterfly_width, int step, complex *temp1, complex *temp2)
{
    struct complex wn;
    int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
    // constant expression, used in the twiddle factor
    const double twoPIdivN = 2 * PI / N;

    int pos = thread_id / butterfly_width * step;
    int j = thread_id % butterfly_width;
    int res = pos + j;
    if (res < N)
    {
        // Wn = e^(-j*2*PI/N), converted with Euler's formula (real and imaginary parts)
        wn.real = cos(twoPIdivN * j * N / step);
        wn.imag = -sin(twoPIdivN * j * N / step);
        temp1[thread_id].real = rev_x[res].real;
        temp1[thread_id].imag = rev_x[res].imag;
        temp2[thread_id].real = rev_x[res + butterfly_width].real * wn.real - rev_x[res + butterfly_width].imag * wn.imag;
        temp2[thread_id].imag = rev_x[res + butterfly_width].imag * wn.real + rev_x[res + butterfly_width].real * wn.imag;
    }
}

__global__ void fft(complex *rev_x, int N, int stage, int butterfly_width, int step, complex *temp1, complex *temp2)
{
    int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
    // constant expression, used in the twiddle factor
    const double twoPIdivN = 2 * PI / N;

    int pos = thread_id / butterfly_width * step;
    int j = thread_id % butterfly_width;
    int res = pos + j;
    if (res < N)
    {
        rev_x[res].real = temp1[thread_id].real + temp2[thread_id].real;
        rev_x[res].imag = temp1[thread_id].imag + temp2[thread_id].imag;
        rev_x[res + butterfly_width].real = temp1[thread_id].real - temp2[thread_id].real;
        rev_x[res + butterfly_width].imag = temp1[thread_id].imag - temp2[thread_id].imag;
    }
}

void fft_caller(complex *x, complex *rev_x, int N)
{
    dim3 threads(BLOCK_SIZE);
    dim3 blocks(GRID_SIZE);
    // int threads = 1024;
    // int blocks = 1;
    float stages = log2f(N);
    int butterfly_width, step;
    struct complex *temp1;
    struct complex *temp2;
    size_t size = N * sizeof(struct complex);
    gpuErrchk(cudaMalloc((void **)&temp1, size));
    gpuErrchk(cudaMalloc((void **)&temp2, size));

    if (N > 1 && is_power_of_two(N))
    {
        for (int stage = 1; stage <= stages; stage++)
        {
            step = 1 << stage;
            butterfly_width = step >> 1;
            if (stage == 1)
                preproccess<<<blocks, threads>>>(x, rev_x, N, stage);
            compute_temp<<<blocks, threads>>>(rev_x, N, stage, butterfly_width, step, temp1, temp2);
            fft<<<blocks, threads>>>(rev_x, N, stage, butterfly_width, step, temp1, temp2);

            cudaDeviceSynchronize();
        }
    }
}
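
gpuErrchk is not defined in the listing above; presumably it is the usual CUDA error-checking wrapper, something like the following (an assumption, not the poster's code):

// Assumed definition of gpuErrchk, the common error-checking wrapper;
// the original post does not show it.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}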

__syncthreads() only synchronizes threads within a block, not across blocks.

The simplest way is to call the kernel fft<<<>>> separately for each stage in a loop on the host, instead of looping over the stages inside the kernel:

float stages = log2f(N);
for (int stage = 1; stage <= stages; stage++)
    fft<<<blocks, threads>>>(dev_input, dev_rev_input, N, stage);
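
A matching kernel for this launch could look roughly as follows. This is only a minimal sketch, not the poster's exact code: step and butterfly_width are derived from the stage argument, the bit-reversal copy is done only for the first stage, and it reuses complex, PI and bit_reversal from the listing above:

// Minimal sketch of a per-stage kernel matching the launch above.
__global__ void fft(complex *x, complex *rev_x, int N, int stage)
{
    int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
    const double twoPIdivN = 2 * PI / N;

    if (stage == 1)
    {
        // bit-reversal permutation before the first stage; with more than one
        // block this copy is better done in its own kernel launch, so that all
        // of rev_x is written before any butterfly reads it
        rev_x[thread_id] = x[bit_reversal(thread_id, N)];
        __syncthreads();
    }

    int step = 1 << stage;
    int butterfly_width = step >> 1;
    int pos = thread_id / butterfly_width * step;
    int j = thread_id % butterfly_width;
    int res = pos + j;
    if (res < N)
    {
        complex wn, a, b;
        wn.real = cos(twoPIdivN * j * N / step);
        wn.imag = -sin(twoPIdivN * j * N / step);
        a = rev_x[res];
        b.real = rev_x[res + butterfly_width].real * wn.real - rev_x[res + butterfly_width].imag * wn.imag;
        b.imag = rev_x[res + butterfly_width].imag * wn.real + rev_x[res + butterfly_width].real * wn.imag;
        rev_x[res].real = a.real + b.real;
        rev_x[res].imag = a.imag + b.imag;
        rev_x[res + butterfly_width].real = a.real - b.real;
        rev_x[res + butterfly_width].imag = a.imag - b.imag;
    }
}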

Please make sure that in the kernel fft no synchronization is actually needed between computing the temp1 and temp2 values and writing them back (which I strongly assume is the case), rather than at the end of a stage as the current __syncthreads() call suggests.
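
That is indeed the case here: for a fixed stage, each thread only reads and writes its own pair (res, res + butterfly_width), and those pairs are disjoint across threads. A small host-side check (illustration only, not from the post) that enumerates the pairs confirms it:

// Illustration: for every stage, each index 0..N-1 belongs to exactly one
// butterfly pair (res, res + w), so butterflies within a stage touch disjoint
// data and need no intra-stage synchronization.
#include <stdio.h>

int main(void)
{
    const int N = 16;
    int touched[16];
    for (int stage = 1; (1 << stage) <= N; stage++)
    {
        int step = 1 << stage;
        int w = step >> 1;                         // butterfly_width
        for (int i = 0; i < N; i++) touched[i] = 0;
        for (int thread_id = 0; thread_id < N; thread_id++)
        {
            int res = thread_id / w * step + thread_id % w;
            if (res < N)
            {
                touched[res]++;
                touched[res + w]++;
            }
        }
        for (int i = 0; i < N; i++)
            if (touched[i] != 1)
                printf("stage %d: index %d touched %d times\n", stage, i, touched[i]);
    }
    printf("check done\n");
    return 0;
}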

In that case you can process the second half of the previous stage and the first half of the next stage in a single kernel launch, keeping the intermediate results in temporary global memory. The first and last kernel launches need special treatment, since only one half has to be executed there.

(I assume this implementation is for self-study or something similar, because there are many possible performance improvements for FFTs on GPU architectures, which go far beyond the scope of a Stack Overflow question.)
