如何在 CUDA 中进行零填充以进行卷积



你好,我需要使用 cuFFT 对两个信号(pulseMatrixRow[i] 和 pulse[i])做卷积。为此,我的代码如下:

int main(int argc, char **argv)

{
    // FFT-based convolution of pulseMatrixRow with pulse using cuFFT:
    //   1. stage both inputs in zero-padded host buffers,
    //   2. copy them to the device and forward-transform both,
    //   3. multiply the spectra point-wise (with normalisation),
    //   4. inverse-transform and write the result to OutputData1.txt.
    // d_signal, d_filter_signal, plan, blocksPerGrid, threadsPerBlock and the
    // size constants are declared outside this block.

    // Host staging buffers, each NX*BATCH complex samples long.
    cufftComplex h_signal[NX*BATCH];
    cufftComplex h_filter_signal[NX*BATCH];
    cufftComplex hf_signal[NX*BATCH];   // fully overwritten by the device-to-host copy below

    // Initialize the memory for the signal.
    // NOTE(review): the imaginary part is set equal to the real part, as in the
    // original code; for a purely real input it would normally be 0.0f - confirm.
    for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
    {
        h_signal[i].x = pulseMatrixRow[i];
        h_signal[i].y = pulseMatrixRow[i];
    }
    // Zero padding: clear every sample after the real data so the FFT does not
    // transform uninitialised stack contents.
    for (unsigned int i = SIGNAL_SIZE; i < NX*BATCH; ++i)
    {
        h_signal[i].x = 0.0f;
        h_signal[i].y = 0.0f;
    }

    // device memory allocation
    cudaMalloc((void**)&d_signal, sizeof(cufftComplex)*NX*BATCH);
    // transfer to device memory (padding included)
    cudaMemcpy(d_signal, h_signal, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice);

    // Initialize the memory for the filter (same real == imaginary convention as above).
    for (unsigned int i = 0; i < FILTER_signal_SIZE; ++i)
    {
        h_filter_signal[i].x = pulse[i];
        h_filter_signal[i].y = pulse[i];
    }
    // Zero padding for the filter, for the same reason as the signal.
    for (unsigned int i = FILTER_signal_SIZE; i < NX*BATCH; ++i)
    {
        h_filter_signal[i].x = 0.0f;
        h_filter_signal[i].y = 0.0f;
    }

    // device memory allocation
    cudaMalloc((void**)&d_filter_signal, sizeof(cufftComplex)*NX*BATCH);
    // transfer to device memory (padding included)
    cudaMemcpy(d_filter_signal, h_filter_signal, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice);

    // CUFFT plan: BATCH independent 1-D complex-to-complex transforms of length NX.
    // NOTE(review): a linear (non-circular) convolution needs
    // NX >= SIGNAL_SIZE + FILTER_signal_SIZE - 1; otherwise the tail wraps around.
    cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);

    // Transform signal and filter in place.
    printf("Transforming signal cufftExecC2C\n");
    cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD);

    printf("Transforming filter_signal cufftExecC2C\n");
    cufftExecC2C(plan, (cufftComplex *)d_filter_signal, (cufftComplex *)d_filter_signal, CUFFT_FORWARD);

    // Multiply the coefficients together and normalise.
    // cuFFT transforms are unnormalised, so a forward + inverse pair scales each
    // length-NX transform by NX; the compensating factor is therefore 1/NX.
    // (The previous expression 1.0f/NX*BATCH evaluated to BATCH/NX.)
    // NOTE(review): only NX elements are multiplied, as in the original; if
    // BATCH > 1 the element count probably needs to be NX*BATCH - confirm
    // against the kernel's definition.
    ComplexPointwiseMulAndScale<<<blocksPerGrid, threadsPerBlock>>>(d_signal, d_filter_signal, NX, 1.0f / NX);
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        fprintf(stderr, "ComplexPointwiseMulAndScale launch failed: %s\n", cudaGetErrorString(launchStatus));
        cufftDestroy(plan);
        cudaFree(d_signal);
        cudaFree(d_filter_signal);
        return 1;
    }

    // Transform signal back
    printf("Transforming signal back cufftExecC2C\n");
    cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE);

    // transfer results from GPU memory (blocking copy, so the result is ready afterwards)
    cudaMemcpy(hf_signal, d_signal, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyDeviceToHost);

    // Write the first NX output samples as "index real imag", one per line.
    int exitCode = 0;
    FILE *fileWritePtr = fopen("OutputData1.txt", "w+");
    if (fileWritePtr == NULL)
    {
        fprintf(stderr, "Could not open OutputData1.txt for writing\n");
        exitCode = 1;
    }
    else
    {
        for (int i = 0; i < NX; i++)
        {
            fprintf(fileWritePtr, "%d %f %f\n", i, hf_signal[i].x, hf_signal[i].y);
        }
        fclose(fileWritePtr);
    }

    // Destroy CUFFT context
    cufftDestroy(plan);
    // cleanup device memory (the host buffers live on the stack, nothing to free)
    cudaFree(d_signal);
    cudaFree(d_filter_signal);

    return exitCode;
}

我用来生成脉冲矩阵的 MATLAB 代码如下:

% One pulse period: 50 ones followed by 450 zeros (500 samples in total).
pulse = [ones(1, 50), zeros(1, 450)];
% 500-by-10 matrix whose every column is one copy of the pulse.
pulseMatrix = repmat(pulse.', 1, 10);
% Stack the columns into a single 5000-by-1 vector (10 pulses back to back).
pulseMatrixRow = reshape(pulseMatrix, [], 1);

但是我每次只需要处理 pulseMatrixRow 中的 1000 个样本,然后再依次处理后面每一段 1000 个样本。由于我的 FFT 长度是 1024,请告诉我应该如何、以及在哪个阶段对输入信号和滤波器信号的末尾补零。滤波器信号很简单,定义为: pulse = [ones(1,50) zeros(1,500-50)];

您可以在把数据传输到设备内存之前,使用 memset() 将主机内存中的填充部分清零;或者

您可以在主机到设备的内存传输之后、执行 FFT 之前,使用 cudaMemset() 将设备内存中的填充部分清零。

请参阅此链接了解如何使用memset()

请参阅此链接了解如何使用cudaMemset()

最新更新