Errors from a CUDA graph captured in a loop with async memory allocations



I am trying to implement a CUDA graph experiment. There are three kernels, kernel_0, kernel_1 and kernel_2. They execute sequentially and have dependencies between them. For now I capture only kernel_1. Here is my code:


#include <stdio.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#define N 50000
#define NSTEP 1000
#define NKERNEL 20
using namespace std::chrono;
static const char *_cudaGetErrorEnum(cudaError_t error) {
  return cudaGetErrorName(error);
}
template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (result) {
    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) "%s" n", file, line,
            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
    exit(EXIT_FAILURE);
  }
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
__global__ void shortKernel_0(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) {
        in_d[idx] = 1.0;
        out_d[idx]=1 + in_d[idx];
    }
}
__global__ void shortKernel_1(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) out_d[idx]=2*in_d[idx];
}
__global__ void shortKernel_2(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) {
        out_d[idx]=3*in_d[idx];
    }
}
void test(){
    size_t size_bytes = N * sizeof(float);
    void * in_d_0;
    void * out_d_0;
    void * out_d_1;
    void * out_d_2;
    int threads = 128;
    int blocks = (N+threads)/threads;
    int iter = 10;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    CUmemoryPool pool_;
    cuDeviceGetDefaultMemPool(&pool_, 0);
    uint64_t threshold = UINT64_MAX;
    cuMemPoolSetAttribute(pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);
    cudaGraph_t graph;
    cudaGraphExec_t instance;
    bool graphCreated=false;
    for (int i =0; i < iter; i++){
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
        shortKernel_0<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
        if (!graphCreated){
            cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
            cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
            cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
            shortKernel_1<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_1), reinterpret_cast<float *>(out_d_0));
            cudaStreamEndCapture(stream, &graph);
            checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            checkCudaErrors(cudaGraphUpload(instance, stream));
            graphCreated = true;
        }else{
            checkCudaErrors(cudaGraphLaunch(instance, stream));
        }
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
        shortKernel_2<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));
       
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);
    }
   
    cudaDeviceSynchronize();        
    printf("With async malloc done!");
    cudaStreamDestroy(stream);
    cudaGraphDestroy(graph);
    cudaGraphExecDestroy(instance);
}
int main() {
    test();
    return 0;
}

The output of kernel_0 is consumed by kernel_1, and the output of kernel_1 is consumed by kernel_2. However, when I run this under compute-sanitizer, I hit some errors. Do you know what this error is? Part of the error output is attached:

========= Program hit CUDA_ERROR_INVALID_VALUE (error 1) due to "invalid argument" on CUDA API call to cuMemFreeAsync.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x2ef045]
=========                in /usr/local/cuda/compat/lib.real/libcuda.so.1
=========     Host Frame:test() [0xb221]
=========                in /opt/test-cudagraph/./a.out
=========     Host Frame:main [0xb4b3]
=========                in /opt/test-cudagraph/./a.out
=========     Host Frame:__libc_start_main [0x24083]
=========                in /usr/lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0xaf6e]
=========                in /opt/test-cudagraph/./a.out

1. Finding exactly where the error occurs

为了获得";"想法";,您需要用错误检查来包装所有API调用。正确地执行这一操作有点棘手,因为cudaError_t运行时-API状态类型和CUresult驱动程序-API状态模式在所有值上都不一致,因此您需要重载错误检查函数:

void check(cudaError_t result, char const *const func, 
   const char *const file, int const line) 
{
  if (result) {
    fprintf(stderr, "CUDA runtime error at %s:%d code=%d(%s) "%s" n", 
    file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
    exit(EXIT_FAILURE);
  }
}
void check(CUresult result, char const *const func, 
   const char *const file, int const line) 
{
  if (result) {
    const char* error_name = "(UNKNOWN)";
    cuGetErrorName(result, &error_name);
    fprintf(stderr, "CUDA driver error at %s:%d code=%d(%s) "%s" n", 
    file, line, static_cast<unsigned int>(result), error_name, func);
    exit(EXIT_FAILURE);
  }
}
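
With both overloads in place, the single checkCudaErrors macro defined earlier dispatches to the right checker for either API. A quick illustration (stream is the variable from the code above; dptr is a hypothetical CUdeviceptr, not from the original post):

checkCudaErrors(cudaStreamCreate(&stream));     // cudaError_t -> runtime overload
checkCudaErrors(cuMemFreeAsync(dptr, stream));  // CUresult    -> driver overload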

When you wrap all of your calls with error checking this way, running the program gets you:

CUDA driver error at a.cu:102 code=1(CUDA_ERROR_INVALID_VALUE) "cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream)" 

The line that triggers the error is:

checkCudaErrors(cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream));

That is, the CUDA driver believes that out_d_1 is not a valid device pointer for (asynchronous) freeing.

This is the easy part, and it isn't even specific to your program.

2. The errors

There are two problems in your code:

  1. On the first pass of the for-loop, you capture the graph via stream capture. When a graph is captured this way, no actual work is performed during the capture. That means that on the first iteration of the for-loop, the line cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream); does nothing: no allocation is performed, and out_d_1 is not modified. Yet in that same for-loop iteration you attempt to free the pointer with cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);, even though on that particular iteration it was never allocated. So the free fails. This explains the cuMemFreeAsync error associated with that usage; see the sketch after this list for a demonstration of the capture behavior.

  2. Your use of cuMemFreeAsync during capture is also problematic, specifically this line: cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream); Here, during graph capture (i.e. during graph execution), you are attempting to free an item (in_d_0) whose allocation was made outside the graph. That is not allowed. See the documentation for cuMemFreeAsync:

During stream capture, this function results in the creation of a free node and must therefore be passed the address of a graph allocation.
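
To make item 1 concrete, here is a minimal sketch (not part of the original code; buf is a hypothetical buffer, and graph, instance, stream and size_bytes are assumed to be declared as above) showing that stream capture only records work; nothing runs until the instantiated graph is launched:

float *buf;
cudaMalloc(&buf, size_bytes);
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
cudaMemsetAsync(buf, 0, size_bytes, stream);   // recorded as a graph node, NOT executed
cudaStreamEndCapture(stream, &graph);
// buf has not been touched at this point; the memset only takes effect after:
checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
checkCudaErrors(cudaGraphLaunch(instance, stream));
checkCudaErrors(cudaStreamSynchronize(stream));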

3. What you can do about it

Combining those two items, one possible way to fix your posted code is as follows:

$ cat t2068.cu
#include <stdio.h>
#include <chrono>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#define N 50000
#define NSTEP 1000
#define NKERNEL 20
using namespace std::chrono;
static const char *_cudaGetErrorEnum(cudaError_t error) {
  return cudaGetErrorName(error);
}
template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (result) {
    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) "%s" n", file, line,
            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
    exit(EXIT_FAILURE);
  }
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
__global__ void shortKernel_0(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) {
        in_d[idx] = 1.0;
        out_d[idx]=1 + in_d[idx];
    }
}
__global__ void shortKernel_1(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) out_d[idx]=2*in_d[idx];
}
__global__ void shortKernel_2(float * out_d, float * in_d){
    int idx=blockIdx.x*blockDim.x+threadIdx.x;
    if(idx<N) {
        out_d[idx]=3*in_d[idx];
    }
}
void test(){
    size_t size_bytes = N * sizeof(float);
    void * in_d_0;
    void * out_d_0;
    void * out_d_1;
    void * out_d_2;
    int threads = 128;
    int blocks = (N+threads)/threads;
    int iter = 10;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    CUmemoryPool pool_;
    cuDeviceGetDefaultMemPool(&pool_, 0);
    uint64_t threshold = UINT64_MAX;
    cuMemPoolSetAttribute(pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);
    cudaGraph_t graph;
    cudaGraphExec_t instance;
    bool graphCreated=false;
    for (int i =0; i < iter; i++){
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
        shortKernel_0<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
        // moved the next line outside of the graph region
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
        if (!graphCreated){
            cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
            cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
            //cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
            shortKernel_1<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_1), reinterpret_cast<float *>(out_d_0));
            cudaStreamEndCapture(stream, &graph);
            checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            checkCudaErrors(cudaGraphUpload(instance, stream));
            graphCreated = true;
        }
        // modified so that we run the instantiated graph on every iteration
        checkCudaErrors(cudaGraphLaunch(instance, stream));
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
        shortKernel_2<<<blocks, threads,0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
        cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);
    }
    cudaDeviceSynchronize();
    printf("With async malloc done!n");
    cudaStreamDestroy(stream);
    cudaGraphDestroy(graph);
    cudaGraphExecDestroy(instance);
}
int main() {
    test();
    return 0;
}
$ nvcc -o t2068 t2068.cu -lcuda
$ compute-sanitizer ./t2068
========= COMPUTE-SANITIZER
With async malloc done!
========= ERROR SUMMARY: 0 errors
$

A reasonable question might be: "If freeing a non-graph allocation in a graph is not allowed, why didn't the graph capture fail?" I suspect the answer is that the graph-capture mechanism has no way to determine, at capture time, whether your CUdeviceptr refers to an entity that was allocated during graph execution.

You might also want to consider avoiding the deallocation and reallocation of the other buffers altogether; after all, the buffer sizes are constant across all iterations.
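
A sketch of that idea follows, reusing the variables from the code above. Note that out_d_1 remains a graph allocation, so it still has to be freed on every iteration before the graph is launched again. A side benefit: because out_d_0 is now allocated exactly once, the pointer baked into the captured graph stays valid on every iteration, rather than relying on the pool handing back the same address after each free/realloc cycle.

// Allocate the non-graph buffers once; size_bytes never changes.
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&in_d_0), size_bytes, pool_, stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_0), size_bytes, pool_, stream);
cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_2), size_bytes, pool_, stream);
for (int i = 0; i < iter; i++){
    shortKernel_0<<<blocks, threads, 0, stream>>>(reinterpret_cast<float *>(out_d_0), reinterpret_cast<float *>(in_d_0));
    if (!graphCreated){
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
        cuMemAllocFromPoolAsync(reinterpret_cast<CUdeviceptr*>(&out_d_1), size_bytes, pool_, stream);
        shortKernel_1<<<blocks, threads, 0, stream>>>(reinterpret_cast<float *>(out_d_1), reinterpret_cast<float *>(out_d_0));
        cudaStreamEndCapture(stream, &graph);
        checkCudaErrors(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
        graphCreated = true;
    }
    checkCudaErrors(cudaGraphLaunch(instance, stream));
    shortKernel_2<<<blocks, threads, 0, stream>>>(reinterpret_cast<float *>(out_d_2), reinterpret_cast<float *>(out_d_1));
    // out_d_1 is a graph allocation; free it each iteration, before relaunch.
    cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_1), stream);
}
// Free the long-lived buffers once, after the loop.
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(in_d_0), stream);
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_0), stream);
cuMemFreeAsync(reinterpret_cast<const CUdeviceptr&>(out_d_2), stream);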

A few observations about this kind of stream-ordered memory allocation in graphs:

  • An item allocated outside a graph cannot be freed in the graph
  • An item allocated in a graph can be freed in the graph
  • An item allocated in a graph need not be freed by the end of graph execution; it can be freed later, in non-graph code, as is done here
  • An item allocated in a graph should be freed before the graph attempts to allocate again, in particular before the graph is launched again. The reason is hopefully self-evident: this would be the classic definition of a memory leak. If you forget, however, you may get a graph runtime error. At graph instantiation you can use a control that frees such allocations automatically at the point of graph launch:

If any allocations created by [the graph being launched] remain unfreed ... and hGraphExec was not instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, the launch will fail with CUDA_ERROR_INVALID_VALUE.
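
With the runtime API, that control looks like the following (a sketch; cudaGraphInstantiateWithFlags requires CUDA 11.4 or newer):

// Instantiate so that any still-unfreed graph allocations are freed
// automatically at the point of each subsequent graph launch:
checkCudaErrors(cudaGraphInstantiateWithFlags(&instance, graph,
                cudaGraphInstantiateFlagAutoFreeOnLaunch));
// The driver-API equivalent is cuGraphInstantiateWithFlags with
// CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH.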
