我正在创建一些设备缓冲区,我试图使用cuda驱动程序API传递给一个简单的内核。我正在创建三个设备缓冲区并将它们存储在std::vector
中。
// NOTE(review): this is the problematic snippet from the question,
// kept verbatim — the bug it contains is the subject of the post.
std::vector<void *> kernel_arguments;
std::vector<float> a = {2};
std::vector<float> b = {3};
for (auto &input : {a, b}) {
CUdeviceptr ptr; // loop-local: occupies the same stack slot every iteration
cuMemAlloc(&ptr, input.size()*sizeof(float));
cuMemcpyHtoD(ptr, input.data(), input.size()*sizeof(float));
// BUG: pushes the address of the loop-local `ptr`, not the device
// pointer value — every pushed entry aliases one stack slot, and it
// dangles once the loop ends.
kernel_arguments.push_back(reinterpret_cast<void *> (&ptr));
}
std::vector<float> c(1);
for (auto &output : {c}) {
CUdeviceptr ptr; // same issue as above
cuMemAlloc(&ptr, output.size()*sizeof(float));
kernel_arguments.push_back(reinterpret_cast<void *> (&ptr));
}
// NOTE(review): missing ';' after this call, and `std::end` below should
// be `std::endl` — presumably transcription typos in the post.
CUresult result = cuLaunchKernel(function, 1, 1, 1,
1024, 1, 1, 0, stream,
kernel_arguments.data(), NULL)
const char *error;
cuGetErrorString(result, &error);
std::cout << result << " " << error << std::end;
result = cuStreamSynchronize(stream);
cuGetErrorString(result, &error);
std::cout << result << " " << error << std::end;
内核函数是一个简单的带有三个参数的加法内核。
// Elementwise add: o_3[index] = i_1[index] + i_2[index].
// Only index 0 does any work here (the guard is `index < 1`), since
// each buffer in the post holds a single float.
__global__ void add_kernel(
    float *i_1,
    float *i_2,
    float *o_3) {
  const size_t index = blockIdx.x*blockDim.x + threadIdx.x;
  if (index < 1) {
    // Debug prints. The "\n" escapes were lost in the post's formatting
    // (the quoted output clearly has newlines); %llu with a cast matches
    // size_t — the original %d is undefined behavior for a 64-bit value.
    printf("index %llu\n", (unsigned long long)index);
    printf("%p\n", i_1);
    printf("%f\n", *i_1);
    const float r_1 = i_1[index];
    printf("%p\n", i_2);
    printf("%f\n", *i_2);
    const float r_2 = i_2[index];
    const float r_3 = r_1 + r_2;
    o_3[index] = r_3;
  }
}
运行这个命令,我得到了输出。
0 no error
index 0
0x14cf4c400200
3.000000
0x14cf4c400200
3.000000
700 an illegal memory access was encountered
为什么第一个和第二个内核参数得到的是同一个指针值?为什么我的第二个设备缓冲区的内容似乎出现在了第一个参数里?
当您 push_back 的是值本身时,这种做法是可行的;但当您 push_back 的是一个栈变量的地址时就不行了——这个地址在 for 循环的每次迭代中都不会改变:
for (auto &input : {a, b}) {
CUdeviceptr ptr; // a stack variable — re-created at the same address every iteration
cuMemAlloc(&ptr, input.size()*sizeof(float));
cuMemcpyHtoD(ptr, input.data(), input.size()*sizeof(float));
kernel_arguments.push_back(reinterpret_cast<void *> (&ptr)); //** BUG: stores the address of the loop-local, not the device pointer value
}
// ptr is out of scope here — every stored address now dangles
这就解释了为什么第一个和第二个参数似乎都引用了第二个内核输入缓冲区(即 i_2,值为 3)。
另外,当我把您展示的代码补全成完整程序时,并没有得到错误 700(不过,push 并使用栈变量的地址,在该变量超出作用域后同样会导致未定义行为/非法访问)。
下面是一个示例(从vectorAddDrv
样例代码修改而来),其中修改了您的分配循环(原来的循环在每次迭代时都会覆盖同一个栈变量),以解决该问题:
$ cat vectorAddDrv.cpp
// Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <cuda.h>
// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>
// includes, CUDA
#include <builtin_types.h>
#include <vector>
using namespace std;
// Variables
// Driver-API handles mirroring the vectorAddDrv sample; main() fills
// them in order: device -> context -> module -> function handle.
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;
//define input fatbin file
// Precompiled kernel image loaded via cuModuleLoadData; overridable at
// compile time with -DFATBIN_FILE=...
#ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif
// Host code
int main(int argc, char **argv)
{
// Initialize
checkCudaErrors(cuInit(0));
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// Create context
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
// first search for the module path before we load the results
string module_path;
std::ostringstream fatbin;
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
{
exit(EXIT_FAILURE);
}
else
{
printf("> initCUDA loading module: <%s>n", module_path.c_str());
}
if (!fatbin.str().size())
{
printf("fatbin file empty. exiting..n");
exit(EXIT_FAILURE);
}
// Create module from binary file (FATBIN)
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// Get function handle from module
checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
// your code, modified
std::vector<void *> kernel_arguments;
std::vector<float> a = {2};
std::vector<float> b = {3};
for (auto &input : {a, b}) {
CUdeviceptr *ptr = new CUdeviceptr;
cuMemAlloc(ptr, input.size()*sizeof(float));
cuMemcpyHtoD(*ptr, input.data(), input.size()*sizeof(float));
kernel_arguments.push_back(ptr);
}
std::vector<float> c(1);
for (auto &output : {c}) {
CUdeviceptr *ptr = new CUdeviceptr;
cuMemAlloc(ptr, output.size()*sizeof(float));
kernel_arguments.push_back(ptr);
}
CUresult result = cuLaunchKernel(vecAdd_kernel, 1, 1, 1,
1024, 1, 1, 0, NULL,
kernel_arguments.data(), NULL);
const char *error;
cuGetErrorString(result, &error);
std::cout << result << " " << error << std::endl;
checkCudaErrors(cuCtxSynchronize());
cuGetErrorString(result, &error);
std::cout << result << " " << error << std::endl;
for (auto &c : kernel_arguments) cuMemFree(*(reinterpret_cast<CUdeviceptr *>(c))); // this works since all of the kernel arguments in this case happen to be CUdeviceptr
exit(EXIT_SUCCESS);
}
$ nvcc -I/usr/local/cuda/samples/common/inc -o test vectorAddDrv.cpp -lcuda
$ compute-sanitizer ./test
========= COMPUTE-SANITIZER
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath found file at <./vectorAdd_kernel64.fatbin>
> initCUDA loading module: <./vectorAdd_kernel64.fatbin>
0 no error
index 0
0x7f8023c00000
2.000000
0x7f8023c00200
3.000000
0 no error
========= ERROR SUMMARY: 0 errors
$