我正在编写一个用于图像处理的CUDA程序。我会为R、G、B三个通道分别启动同一个内核"processOneChannel"。
下面,我尝试为这三次内核启动分别指定不同的流,以便它们可以并发执行。但nvprof显示,它们仍然是一个接一个地串行执行的。
在这三个内核之前和之后各还有一个内核(separateChannels 和 recombineChannels),我不希望这两个内核与它们并发运行。
基本上我想要的流程如下:分离通道-->处理一个通道(x3,并发)-->重组通道
请告诉我哪里做错了。
void kernelLauncher(const ushort4 * const h_inputImageRGBA, ushort4 * const d_inputImageRGBA,
ushort4* const d_outputImageRGBA, const size_t numRows, const size_t numCols,
unsigned short *d_redProcessed,
unsigned short *d_greenProcessed,
unsigned short *d_blueProcessed,
unsigned short *d_prand)
{
int MAXTHREADSx = 512;
int MAXTHREADSy = 1;
int nBlockX = numCols / MAXTHREADSx + 1;
int nBlockY = numRows / MAXTHREADSy + 1;
const dim3 blockSize(MAXTHREADSx,MAXTHREADSy,1);
const dim3 gridSize(nBlockX,nBlockY,1);
// cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
int nstreams = 5;
cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++)
{
checkCudaErrors(cudaStreamCreateWithFlags(&(streams[i]),cudaStreamNonBlocking));
}
separateChannels<<<gridSize,blockSize>>>(d_inputImageRGBA,
(int)numRows,
(int)numCols,
d_red,
d_green,
d_blue);
cudaDeviceSynchronize();
checkCudaErrors(cudaGetLastError());
processOneChannel<<<gridSize,blockSize,0,streams[0]>>>(d_red,
d_redProcessed,
(int)numRows,(int)numCols,
d_filter,d_prand);
processOneChannel<<<gridSize,blockSize,0,streams[1]>>>(d_green,
d_greenProcessed,
(int)numRows,(int)numCols,
d_filter,d_prand);
processOneChannel<<<gridSize,blockSize,0,streams[2]>>>(d_blue,
d_blueProcessed,
(int)numRows,(int)numCols,
d_filter,d_prand);
cudaDeviceSynchronize();
checkCudaErrors(cudaGetLastError());
recombineChannels<<<gridSize, blockSize>>>(d_redProcessed,
d_greenProcessed,
d_blueProcessed,
d_outputImageRGBA,
numRows,
numCols);
for (int i = 0; i < nstreams; i++)
{
cudaStreamDestroy(streams[i]);
}
free(streams);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
以下是nvprof的GPU跟踪(gpu-trace)输出。请注意,内核启动之前的memcpy用于传输处理所需的滤波器数据,因此这些拷贝不能与内核并行执行。
==10001== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
1.02428s 2.2400us - - - - - 28.125MB 1e+04GB/s GeForce GT 750M 1 13 [CUDA memset]
1.02855s 18.501ms - - - - - 28.125MB 1.4846GB/s GeForce GT 750M 1 13 [CUDA memcpy HtoD]
1.21959s 1.1371ms - - - - - 1.7580MB 1.5098GB/s GeForce GT 750M 1 13 [CUDA memcpy HtoD]
1.22083s 1.3440us - - - - - 7.0313MB 5e+03GB/s GeForce GT 750M 1 13 [CUDA memset]
1.22164s 1.3440us - - - - - 7.0313MB 5e+03GB/s GeForce GT 750M 1 13 [CUDA memset]
1.22243s 3.6480us - - - - - 7.0313MB 2e+03GB/s GeForce GT 750M 1 13 [CUDA memset]
1.22349s 10.240us - - - - - 8.0000KB 762.94MB/s GeForce GT 750M 1 13 [CUDA memcpy HtoD]
1.22351s 6.6021ms (6 1441 1) (512 1 1) 12 0B 0B - - GeForce GT 750M 1 13 separateChannels(...) [123]
1.23019s 10.661ms (6 1441 1) (512 1 1) 36 192B 0B - - GeForce GT 750M 1 14 processOneChannel(...) [133]
1.24085s 10.518ms (6 1441 1) (512 1 1) 36 192B 0B - - GeForce GT 750M 1 15 processOneChannel(...) [141]
1.25137s 10.779ms (6 1441 1) (512 1 1) 36 192B 0B - - GeForce GT 750M 1 16 processOneChannel(...) [149]
1.26372s 5.7810ms (6 1441 1) (512 1 1) 15 0B 0B - - GeForce GT 750M 1 13 recombineChannels(...) [159]
1.26970s 19.859ms - - - - - 28.125MB 1.3831GB/s GeForce GT 750M 1 13 [CUDA memcpy DtoH]
以下是CMakeLists.txt,我在其中向nvcc传递了"-default-stream per-thread"选项(即每个线程使用独立的默认流)
# Build script for the "My" executable: a C++ entry point plus every CUDA
# source found in this directory, linked against OpenCV.
cmake_minimum_required(VERSION 2.6 FATAL_ERROR)

# Both packages are mandatory; configuration stops if either is missing.
find_package(OpenCV REQUIRED)
find_package(CUDA REQUIRED)

# Extend the nvcc command line with the per-thread default-stream option.
list(APPEND CUDA_NVCC_FLAGS -default-stream per-thread)

# Collect headers and CUDA sources that sit next to this file.
file(GLOB hdr *.hpp *.h)
file(GLOB cu *.cu)

# Host-only translation units.
set(My_files main.cpp)

# Project Executable
cuda_add_executable(My ${My_files} ${hdr} ${cu})
target_link_libraries(My ${OpenCV_LIBS})
每个内核启动的网格大小都是6*1441,即超过8000个块,每个块有512个线程。这已经占满了整个GPU,使后续启动的内核的块无法同时执行。
GPU的容量是有限的。以块为单位的最大瞬时容量等于GPU中的SM数量乘以每个SM可同时驻留的最大块数,这两个规格都可以通过deviceQuery应用程序查询。一旦GPU被占满,它就无法再接收更多的块,直到一些正在运行的块执行完毕(退役)。对于第一个启动的内核,这个过程会一直持续到它的大部分块都已退役,然后第二个内核才会开始执行。