CUDA slower than expected - image processing

I'm new to CUDA development and wanted to write a simple benchmark to test the feasibility of some image processing. I have 32 images, each 720x540, with one grayscale byte per pixel.

I run each benchmark for 10 seconds and count how many times the images can be processed. There are three benchmarks:

  • The first simply transfers the images into GPU global memory via cudaMemcpy.
  • The second transfers the images and processes them.
  • The third runs the equivalent test on the CPU.

As a simple initial test, the image processing just counts the number of pixels above a given gray-level threshold. I'm finding that accessing global memory on the GPU is very slow. The benchmark is structured so that it creates one block per image, with one thread per row within each image. Each thread counts its pixels into a shared memory array, after which the first thread sums them up (see below).

The problem I'm running into is that this all runs very slowly - around 50 fps - much slower than the CPU version at around 230 fps. If I comment out the pixel-value comparison, so that all pixels get counted, I get roughly 6x the performance. I tried using texture memory but saw no performance gain. I'm running on a Quadro K2000. Also: the copy-only benchmark can copy at around 330 fps, so that doesn't appear to be the bottleneck.

Any help/pointers would be much appreciated. Thanks.

__global__ void ThreadPerRowCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns)
{
    extern __shared__ int row_counts[]; //this parameter to kernel call "<<<, ,>>>" sets the size
    //see here for indexing https://blog.usejournal.com/cuda-thread-indexing-fb9910cba084
    int myImage = blockIdx.y * gridDim.x + blockIdx.x;
    int myStartRow = (threadIdx.y * blockDim.x + threadIdx.x);
    unsigned char *imageStart = AllPixels[myImage];
    unsigned char *pixelStart = imageStart + myStartRow * W;
    unsigned char *pixelEnd   = pixelStart + W;
    unsigned char *pixelItr   = pixelStart;
    int row_count = 0;
    while (pixelItr < pixelEnd)
    {
        if (*pixelItr > Threshold) //REMOVING THIS LINE GIVES 6x PERFORMANCE
        {
            row_count++;
        }
        pixelItr++;
    }
    row_counts[myStartRow] = row_count;
    __syncthreads();
    if (myStartRow == 0)
    {   //first thread sums up for the whole image
        int image_count = 0;
        for (int i = 0; i < H; i++)
        {
            image_count += row_counts[i];
        }
        AllReturns[myImage] = image_count;
    }
}


extern "C" void cuda_Benchmark(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
{   
ThreadPerRowCounter<<<nImages, H, sizeof(int)*H>>> (
Threshold,
W, H,
AllPixels,
AllReturns);
//wait for all blocks to finish
checkCudaErrors(cudaDeviceSynchronize());
}

Two changes to your kernel design can produce a significant speedup:

  1. Perform the operation column-wise instead of row-wise, so that adjacent threads read adjacent pixels and global memory accesses coalesce. General background on why this matters/helps is described here. (See the first sketch after this list.)

  2. Replace your final operation with a canonical parallel reduction. (See the second sketch after this list.)
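
To illustrate item 1, here is a minimal sketch, not part of the original post, contrasting the two access patterns; the kernel names, parameters, and launch shapes (e.g. rowWiseCount<<<1, H>>> and colWiseCount<<<1, W>>> on a single image) are my own assumptions:

// One thread per ROW: at each loop step, the threads of a warp read
// bytes that are W apart in memory -> uncoalesced, one transaction per thread.
__global__ void rowWiseCount(const unsigned char *img, int W, int H, int Threshold, int *out)
{
    int row = threadIdx.x;            // thread r walks row r left to right
    int count = 0;
    for (int c = 0; c < W; c++)
        if (img[row * W + c] > Threshold) count++;
    out[row] = count;
}

// One thread per COLUMN: at each loop step, the threads of a warp read
// adjacent bytes -> coalesced into a few wide transactions.
__global__ void colWiseCount(const unsigned char *img, int W, int H, int Threshold, int *out)
{
    int col = threadIdx.x;            // thread c walks column c top to bottom
    int count = 0;
    for (int r = 0; r < H; r++)
        if (img[r * W + col] > Threshold) count++;
    out[col] = count;
}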

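To illustrate item 2, here is a minimal sketch of the canonical shared-memory tree reduction, assuming the block size is a power of two (the full code below handles non-power-of-two widths via the rsize parameter); this kernel is my own illustration, not part of the original post:

__global__ void blockSum(const int *in, int *out)
{
    extern __shared__ int sdata[];   // one int per thread, sized at launch
    int tid = threadIdx.x;
    sdata[tid] = in[blockIdx.x * blockDim.x + tid];
    __syncthreads();
    // Tree reduction: halve the number of active threads at each step.
    // __syncthreads() stays outside the if so every thread reaches it.
    for (int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
    }
    if (tid == 0) out[blockIdx.x] = sdata[0];   // one partial sum per block
}
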
Per my testing, these two changes together yield a ~22x speedup in kernel performance (54.3 ms vs. 2.46 ms in the nvprof output below):

$ cat t49.cu
#include <iostream>
#include <helper_cuda.h>
typedef unsigned char U8;
__global__ void ThreadPerRowCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns)
{
    extern __shared__ int row_counts[]; //this parameter to kernel call "<<<, ,>>>" sets the size
    //see here for indexing https://blog.usejournal.com/cuda-thread-indexing-fb9910cba084
    int myImage = blockIdx.y * gridDim.x + blockIdx.x;
    int myStartRow = (threadIdx.y * blockDim.x + threadIdx.x);
    unsigned char *imageStart = AllPixels[myImage];
    unsigned char *pixelStart = imageStart + myStartRow * W;
    unsigned char *pixelEnd   = pixelStart + W;
    unsigned char *pixelItr   = pixelStart;
    int row_count = 0;
    while (pixelItr < pixelEnd)
    {
        if (*pixelItr > Threshold) //REMOVING THIS LINE GIVES 6x PERFORMANCE
        {
            row_count++;
        }
        pixelItr++;
    }
    row_counts[myStartRow] = row_count;
    __syncthreads();
    if (myStartRow == 0)
    {   //first thread sums up for the whole image
        int image_count = 0;
        for (int i = 0; i < H; i++)
        {
            image_count += row_counts[i];
        }
        AllReturns[myImage] = image_count;
    }
}

__global__ void ThreadPerColCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns, int rsize)
{
    extern __shared__ int col_counts[]; //this parameter to kernel call "<<<, ,>>>" sets the size
    int myImage = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned char *imageStart = AllPixels[myImage];
    int myStartCol = (threadIdx.y * blockDim.x + threadIdx.x);
    int col_count = 0;
    // walk down one column; adjacent threads touch adjacent bytes -> coalesced
    for (int i = 0; i < H; i++)
        if (imageStart[myStartCol + i*W] > Threshold) col_count++;
    col_counts[threadIdx.x] = col_count;
    __syncthreads();
    // tree reduction; rsize is the smallest power of 2 >= W/2, and the
    // (threadIdx.x + i < W) guard keeps the first pass inside the array
    for (int i = rsize; i > 0; i >>= 1) {
        if ((threadIdx.x + i < W) && (threadIdx.x < i)) col_counts[threadIdx.x] += col_counts[threadIdx.x + i];
        __syncthreads();
    }
    if (!threadIdx.x) AllReturns[myImage] = col_counts[0];
}
void cuda_Benchmark(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
{
    ThreadPerRowCounter<<<nImages, H, sizeof(int)*H>>> (
        Threshold,
        W, H,
        AllPixels,
        AllReturns);
    //wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
// round v up to the next power of 2 (bit-smearing trick)
unsigned next_power_of_2(unsigned v){
    v--;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v++;
    return v;
}
void cuda_Benchmark1(int nImages, int W, int H, U8** AllPixels, int *AllReturns, int Threshold)
{
    int rsize = next_power_of_2(W/2);
    ThreadPerColCounter<<<nImages, W, sizeof(int)*W>>> (
        Threshold,
        W, H,
        AllPixels,
        AllReturns, rsize);
    //wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
int main(){
    const int my_W = 720;
    const int my_H = 540;
    const int n_img = 128;
    const int my_thresh = 10;
    U8 **img_p, **img_ph;
    U8 *img, *img_h;
    int *res, *res_h, *res_h1;
    img_ph = (U8 **)malloc(n_img*sizeof(U8*));
    cudaMalloc(&img_p, n_img*sizeof(U8*));
    cudaMalloc(&img, n_img*my_W*my_H*sizeof(U8));
    img_h = new U8[n_img*my_W*my_H];
    for (int i = 0; i < n_img*my_W*my_H; i++) img_h[i] = rand()%20;
    cudaMemcpy(img, img_h, n_img*my_W*my_H*sizeof(U8), cudaMemcpyHostToDevice);
    for (int i = 0; i < n_img; i++) img_ph[i] = img+my_W*my_H*i;
    cudaMemcpy(img_p, img_ph, n_img*sizeof(U8*), cudaMemcpyHostToDevice);
    cudaMalloc(&res, n_img*sizeof(int));
    cuda_Benchmark(n_img, my_W, my_H, img_p, res, my_thresh);
    res_h = new int[n_img];
    cudaMemcpy(res_h, res, n_img*sizeof(int), cudaMemcpyDeviceToHost);
    cuda_Benchmark1(n_img, my_W, my_H, img_p, res, my_thresh);
    res_h1 = new int[n_img];
    cudaMemcpy(res_h1, res, n_img*sizeof(int), cudaMemcpyDeviceToHost);
    // verify that the two kernels agree
    for (int i = 0; i < n_img; i++)
        if (res_h[i] != res_h1[i]) {
            std::cout << "mismatch at: " << i << " was: " << res_h1[i] << " should be: " << res_h[i] << std::endl;
            return 0;
        }
}
$ nvcc -o t49 t49.cu -I/usr/local/cuda/samples/common/inc
$ cuda-memcheck ./t49
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t49
==1756== NVPROF is profiling process 1756, command: ./t49
==1756== Profiling application: ./t49
==1756== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   72.02%  54.325ms         1  54.325ms  54.325ms  54.325ms  ThreadPerRowCounter(int, int, int, unsigned char**, int*)
                   24.71%  18.639ms         2  9.3195ms  1.2800us  18.638ms  [CUDA memcpy HtoD]
                    3.26%  2.4586ms         1  2.4586ms  2.4586ms  2.4586ms  ThreadPerColCounter(int, int, int, unsigned char**, int*, int)
                    0.00%  3.1040us         2  1.5520us  1.5360us  1.5680us  [CUDA memcpy DtoH]
      API calls:   43.63%  59.427ms         3  19.809ms  18.514us  59.159ms  cudaMalloc
                   41.70%  56.789ms         2  28.394ms  2.4619ms  54.327ms  cudaDeviceSynchronize
                   14.02%  19.100ms         4  4.7749ms  17.749us  18.985ms  cudaMemcpy
                    0.52%  705.26us        96  7.3460us     203ns  327.21us  cuDeviceGetAttribute
                    0.05%  69.268us         1  69.268us  69.268us  69.268us  cuDeviceTotalMem
                    0.04%  50.688us         1  50.688us  50.688us  50.688us  cuDeviceGetName
                    0.04%  47.683us         2  23.841us  14.352us  33.331us  cudaLaunchKernel
                    0.00%  3.1770us         1  3.1770us  3.1770us  3.1770us  cuDeviceGetPCIBusId
                    0.00%  1.5610us         3     520ns     249ns     824ns  cuDeviceGetCount
                    0.00%  1.0550us         2     527ns     266ns     789ns  cuDeviceGet
$

(Quadro K2000, CUDA 9.2.148, Fedora Core 27)

(next_power_of_2 code lifted from this answer)
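
If you'd prefer to measure the kernels yourself rather than via nvprof, a minimal sketch using CUDA events might look like the following; this is my own addition, not part of the original benchmark, and it assumes the variables set up in main() above:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cuda_Benchmark1(n_img, my_W, my_H, img_p, res, my_thresh);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed GPU time in milliseconds
std::cout << "ThreadPerColCounter: " << ms << " ms" << std::endl;
cudaEventDestroy(start);
cudaEventDestroy(stop);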

I make no claims as to the correctness of this code or any other code I post. Anyone who uses any code I post does so at their own risk. I am merely claiming that I have attempted to address the questions in the original post and to provide some explanation. I am not claiming my code is defect-free, or that it is suitable for any particular purpose. Use it (or not) at your own risk.
