I am trying to implement a CUDA graph experiment. There are three kernels, kernel_0, kernel_1 and kernel_2. They are executed in sequence and have dependencies between them. For now I am only capturing kernel_1. This is my code:
#include <stdio.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#define N 50000
#define NSTEP 1000
#define NKERNEL 20
using namespace std::chrono;
static const char *_cudaGetErrorEnum(cudaError_t error) {
return cudaGetErrorName(error);
}
template <typename T>
void check(T result, char const *const func, const char *const file,
int const line) {
if (result) {
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) "%s" n", file, line,
static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
exit(EXIT_FAILURE);
}
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
__global__ void shortKernel_0(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) {
in_d[idx] = 1.0;
out_d[idx]=1 + in_d[idx];
}
}
__global__ void shortKernel_1(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) out_d[idx]=2*in_d[idx];
}
__global__ void shortKernel_2(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) {
out_d[idx]=3*in_d[idx];
}
}
void test(){
size_t size_bytes = N * sizeof(float);
void * in_d_0;
void * out_d_0;
void * out_d_1;
void * out_d_2;
int threads = 128;
int blocks = (N+threads)/threads;
int iter = 10;
cudaStream_t stream;
cudaStreamCreate(&stream);
CUmemoryPool pool_;
cuDeviceGetDefaultMemPool(&pool_, 0);
uint64_t threshold = UINT64_MAX;
cuMemPoolSetAttribute(pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);
cudaGraph_t graph;
cudaGraphExec_t instance;
bool graphCreated=false;
for (int i =0; i < iter; i++){
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
shortKernel_0<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
if (!graphCreated){
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
shortKernel_1<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_1), reinterpret_cast<float *>(out_d_0));
cudaStreamEndCapture(stream, &graph);
checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
checkCudaErrors(cudaGraphUpload(instance, stream));
graphCreated = true;
}else{
checkCudaErrors(cudaGraphLaunch(instance, stream));
}
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
shortKernel_2<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);
}
cudaDeviceSynchronize();
printf("With async malloc done!");
cudaStreamDestroy(stream);
cudaGraphDestroy(graph);
cudaGraphExecDestroy(instance);
}
int main() {
test();
return 0;
}
The output of kernel_0 is consumed by kernel_1, and the output of kernel_1 is consumed by kernel_2. However, when I run it with compute-sanitizer I get some errors. Do you have any idea about this error? Part of the error output is attached:
========= Program hit CUDA_ERROR_INVALID_VALUE (error 1) due to "invalid argument" on CUDA API call to cuMemFreeAsync.
========= Saved host backtrace up to driver entry point at error
========= Host Frame: [0x2ef045]
========= in /usr/local/cuda/compat/lib.real/libcuda.so.1
========= Host Frame:test() [0xb221]
========= in /opt/test-cudagraph/./a.out
========= Host Frame:main [0xb4b3]
========= in /opt/test-cudagraph/./a.out
========= Host Frame:__libc_start_main [0x24083]
========= in /usr/lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0xaf6e]
========= in /opt/test-cudagraph/./a.out
1. Find out exactly where the error occurs
To get an "idea", you need to wrap all of your API calls with error checking. Doing this properly is a little tricky, because the cudaError_t runtime-API status type and the CUresult driver-API status type are not consistent across all values, so you need to overload the error-checking function:
void check(cudaError_t result, char const *const func,
const char *const file, int const line)
{
if (result) {
fprintf(stderr, "CUDA runtime error at %s:%d code=%d(%s) "%s" n",
file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
exit(EXIT_FAILURE);
}
}
void check(CUresult result, char const *const func,
const char *const file, int const line)
{
if (result) {
const char* error_name = "(UNKNOWN)";
cuGetErrorName(result, &error_name);
fprintf(stderr, "CUDA driver error at %s:%d code=%d(%s) "%s" n",
file, line, static_cast<unsigned int>(result), error_name, func);
exit(EXIT_FAILURE);
}
}
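Note that the checkCudaErrors macro from the posted code can stay exactly as it is; C++ overload resolution selects the cudaError_t or the CUresult version of check() based on the return type of the wrapped call:
// unchanged from the posted code; now dispatches to either overload
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)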
With all of your calls wrapped in an error check, running the program gets you:
CUDA driver error at a.cu:102 code=1(CUDA_ERROR_INVALID_VALUE) "cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream)"
The line that triggers the error is:
checkCudaErrors(cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream));
i.e. the CUDA driver does not consider out_d_1 a valid device pointer for (asynchronous) freeing.
That was the easy part, and it is not even specific to your program.
2. The errors
There are two problems in your code:
On the first trip through the for-loop you capture the graph via stream capture. When a graph is captured this way, no actual work is done during the capture. That means that on the first iteration of the for-loop, this line
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
does nothing: no allocation is performed and out_d_1 is not modified. Later in that same for-loop iteration, however, you attempt to free that pointer here:
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
but in that particular loop iteration it was never allocated, so the free fails. That explains the cuMemFreeAsync problem reported for this usage (a small capture-status check that makes this behaviour visible is sketched below).
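As an aside, a minimal sketch (using the stream variable from the posted code, with the surrounding allocations omitted) of how to convince yourself that work issued between cudaStreamBeginCapture and cudaStreamEndCapture is only recorded as graph nodes, not executed:
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
cudaStreamCaptureStatus status;
checkCudaErrors(cudaStreamIsCapturing(stream, &status));
// status == cudaStreamCaptureStatusActive here: everything issued to this
// stream is turned into graph nodes and only runs when the instantiated
// graph is launched, not at this point in host code.
cudaStreamEndCapture(stream, &graph);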
There is also a problem with the use of cuMemFreeAsync during capture, specifically this line:
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
Here, the item you are freeing during graph capture (i.e. whose free will take place during graph execution), in_d_0, was allocated outside the graph. That is not allowed. See the documentation for cuMemFreeAsync:
During stream capture, this function results in the creation of a free node and must therefore be passed the address of a graph allocation.
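For illustration only (this is not the fix applied below), a minimal sketch of a capture that the documentation does allow, reusing the variables from the posted code: the pointer being freed is itself a graph allocation created during the same capture.
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
CUdeviceptr tmp; // hypothetical temporary, not part of the posted code
// mem-alloc node: tmp is a graph allocation
checkCudaErrors(cuMemAllocFromPoolAsync(&tmp, size_bytes, pool_, stream));
shortKernel_1<<<blocks, threads, 0, stream>>>(reinterpret_cast<float *>(tmp), reinterpret_cast<float *>(out_d_0));
// free node: allowed, because tmp was allocated by this graph
checkCudaErrors(cuMemFreeAsync(tmp, stream));
cudaStreamEndCapture(stream, &graph);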
3. What you can do about it
Combining these two items, one possible approach to fix your posted code is as follows:
$ cat t2068.cu
#include <stdio.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#define N 50000
#define NSTEP 1000
#define NKERNEL 20
using namespace std::chrono;
static const char *_cudaGetErrorEnum(cudaError_t error) {
return cudaGetErrorName(error);
}
template <typename T>
void check(T result, char const *const func, const char *const file,
int const line) {
if (result) {
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) "%s" n", file, line,
static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
exit(EXIT_FAILURE);
}
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
__global__ void shortKernel_0(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) {
in_d[idx] = 1.0;
out_d[idx]=1 + in_d[idx];
}
}
__global__ void shortKernel_1(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) out_d[idx]=2*in_d[idx];
}
__global__ void shortKernel_2(float * out_d, float * in_d){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) {
out_d[idx]=3*in_d[idx];
}
}
void test(){
size_t size_bytes = N * sizeof(float);
void * in_d_0;
void * out_d_0;
void * out_d_1;
void * out_d_2;
int threads = 128;
int blocks = (N+threads)/threads;
int iter = 10;
cudaStream_t stream;
cudaStreamCreate(&stream);
CUmemoryPool pool_;
cuDeviceGetDefaultMemPool(&pool_, 0);
uint64_t threshold = UINT64_MAX;
cuMemPoolSetAttribute(pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);
cudaGraph_t graph;
cudaGraphExec_t instance;
bool graphCreated=false;
for (int i =0; i < iter; i++){
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
shortKernel_0<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
// moved the next line outside of the graph region
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
if (!graphCreated){
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
//cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
shortKernel_1<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_1), reinterpret_cast<float *>(out_d_0));
cudaStreamEndCapture(stream, &graph);
checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
checkCudaErrors(cudaGraphUpload(instance, stream));
graphCreated = true;
}
// modified so that we run the instantiated graph on every iteration
checkCudaErrors(cudaGraphLaunch(instance, stream));
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
shortKernel_2<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);
}
cudaDeviceSynchronize();
printf("With async malloc done!n");
cudaStreamDestroy(stream);
cudaGraphDestroy(graph);
cudaGraphExecDestroy(instance);
}
int main() {
test();
return 0;
}
$ nvcc -o t2068 t2068.cu -lcuda
$ compute-sanitizer ./t2068
========= COMPUTE-SANITIZER
With async malloc done!
========= ERROR SUMMARY: 0 errors
$
A reasonable question might be: "If freeing a non-graph allocation in a graph is not allowed, why didn't the graph capture fail?" I suspect the answer is that the graph capture mechanism has no way to determine, at capture time, whether your CUdeviceptr holds an entity that was allocated during graph execution.
You might also want to consider avoiding the deallocation and reallocation of the other buffers altogether. After all, the buffer sizes are constant across all iterations; one possible rearrangement is sketched below.
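For example, a minimal, untested sketch that reuses the variables and the checkCudaErrors wrapper from the code above: in_d_0, out_d_0 and out_d_2 are allocated once outside the loop, and only the graph-owned out_d_1 allocation remains inside the captured graph.
// allocate once, before the loop; sizes never change between iterations
checkCudaErrors(cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream));
checkCudaErrors(cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream));
checkCudaErrors(cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream));
for (int i = 0; i < iter; i++){
  shortKernel_0<<<blocks, threads, 0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
  // ... capture (first iteration) / launch the graph that allocates out_d_1 and runs shortKernel_1 ...
  shortKernel_2<<<blocks, threads, 0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));
  // out_d_1 is a graph allocation; it is freed outside the graph, once per launch
  checkCudaErrors(cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream));
}
// free once, after the loop
checkCudaErrors(cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream));
checkCudaErrors(cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream));
checkCudaErrors(cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream));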
Some observations about this kind of stream-ordered memory allocation in graphs:
- An item allocated outside the graph cannot be freed in the graph
- An item allocated in the graph can be freed in the graph
- An item allocated in the graph does not have to be freed by the end of graph execution; it can be freed later (in non-graph code, as demonstrated here)
- An item allocated in the graph should be freed before the graph attempts to allocate again, in particular before the graph is launched again. Hopefully the reason is evident: otherwise it is a typical memory leak. In case this is forgotten, however, a graph runtime error may be raised. There is a control available at graph instantiation time that automatically frees such allocations at the point of graph launch (a sketch follows the quoted documentation below):
If any allocations created by [the graph being launched] remain unfreed ... and hGraphExec was not instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, the launch will fail with CUDA_ERROR_INVALID_VALUE.
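A minimal sketch of how that could be applied to the code above, assuming CUDA 11.4 or newer where the runtime-API equivalents cudaGraphInstantiateWithFlags and cudaGraphInstantiateFlagAutoFreeOnLaunch are available; it would replace the plain cudaGraphInstantiate call:
// instantiate so that any graph allocations still unfreed at the next launch
// of this executable graph are freed automatically at that launch
checkCudaErrors(cudaGraphInstantiateWithFlags(&instance, graph, cudaGraphInstantiateFlagAutoFreeOnLaunch));
checkCudaErrors(cudaGraphUpload(instance, stream));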