C语言 cudaLaunchKernel启动内核失败



我正在尝试使用运行时API启动内核函数。由于某种原因,我不能直接调用cudaLaunchKernel,而是调用一个在内部调用cudaLaunchKernel的函数。下面是一个示例,它只是从设备端打印一条消息:

#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
/**
 * Minimal test kernel: each launched thread prints one greeting line
 * using device-side printf. Takes no arguments and writes no memory.
 */
__global__
void hello()
{
    printf("hello from kernel. \n");
}
/**
 * Launches an argument-less kernel through the runtime API call
 * cudaLaunchKernel, on the default stream with no dynamic shared memory.
 *
 * @param kernel  The __global__ function to launch (passed as a function pointer).
 * @param grid    Grid dimensions {x, y, z}. Every component must be at least 1;
 *                a zero component makes the launch fail.
 * @param block   Block dimensions {x, y, z}. Same non-zero requirement as grid.
 * @return 0 when the launch was accepted, -1 when cudaLaunchKernel reported an
 *         error (the error description is printed).
 */
template<typename T>
int launchKernel (T kernel , const size_t grid[3] , const size_t block[3])
{
    // dim3 stores unsigned int components, so the size_t inputs are narrowed explicitly.
    const dim3 grid3d((unsigned int)grid[0], (unsigned int)grid[1], (unsigned int)grid[2]);
    const dim3 block3d((unsigned int)block[0], (unsigned int)block[1], (unsigned int)block[2]);

    // args = NULL (kernel has no parameters), sharedMem = 0, stream = NULL (default stream).
    const cudaError_t res = cudaLaunchKernel((const void*)kernel, grid3d, block3d, NULL, 0, NULL);

    // Runtime API calls return cudaError_t, whose success value is cudaSuccess
    // (CUDA_SUCCESS belongs to the driver API's CUresult).
    if (res != cudaSuccess)
    {
        printf("error during kernel launch: %s \n", cudaGetErrorString(res));
        return -1;
    }
    return 0;
}
// Reproducer entry point: allocates a 32-float host buffer and a 32-float
// device buffer, launches the hello kernel via launchKernel (option 2),
// copies the device buffer back to the host, and releases both buffers.
// The hello kernel takes no arguments, so it never touches hx or dx.
// NOTE(review): the return values of cudaMalloc, launchKernel and cudaMemcpy
// are ignored here, so a failed launch goes unnoticed by main itself.
// NOTE(review): malloc/free are used but <stdlib.h> is not among the includes
// shown with this example — confirm it is pulled in transitively.
int main(void)
{
float *hx, *dx;
hx = (float*)malloc(32 * sizeof(float));
cudaMalloc(&dx, 32 * sizeof(float));
unsigned int threads = 32;
unsigned int blocks = 1;
///////////// option 1: directly call runtime api: cudaLaunchKernel //////////////
//cudaLaunchKernel((void*)hello, dim3(blocks), dim3(threads), NULL, 0, NULL);
//////////////////////////////////////////////////////////////////////////////////
///////// option 2: call a function which further calls cudaLaunchKernel /////////
// NOTE: the y and z components below are 0. Grid and block dimensions must
// not be zero, so this launch is rejected; option 1's dim3(blocks) leaves
// the unspecified y and z components at 1 instead.
const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};
launchKernel (hello , grid3d , block3d);
//////////////////////////////////////////////////////////////////////////////////
// Blocking device-to-host copy of the 32 floats.
cudaMemcpy(hx, dx, 32 * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dx);
free(hx);
return 0;
}

选项1(直接调用cudaLaunchKernel)可以正常工作。但是,选项2(间接调用cudaLaunchKernel)不起作用:使用选项2时,设备端没有打印任何消息,并且返回值不等于CUDA_SUCCESS。

我想知道有没有人对这个问题有什么见解。

提前感谢您的帮助和时间。

网格和块尺寸不能为零:

const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};

你的两次启动行为不同,是因为你创建网格和块维度变量的方式不同:选项1中的dim3(blocks)会把未指定的y、z维度默认设为1,而选项2把它们显式设为了0。

如果改成:

const size_t grid3d[3] = {blocks, 1, 1};
const size_t block3d[3] = {threads, 1, 1};

这两种情况都可以。

顺便说一下,使用这种错误捕获对你自己没有任何好处:
// Error check as written in the question: it reports only that the launch
// failed, without showing which error code was returned in res.
if (res != CUDA_SUCCESS)
{
    char msg[256];
    printf ("error during kernel launch \n");
    return -1;
}

这样更有启发性:

// Compare against the runtime API's cudaSuccess and print the readable
// description of the returned error code.
if (res != cudaSuccess)
{
    printf ("error during kernel launch: %s \n", cudaGetErrorString(res));
    return -1;
}

最新更新