Consider the following program (written in C syntax):
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
int main() {
    CUresult result;
    unsigned int init_flags = 0;
    result = cuInit(init_flags);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    CUcontext ctx;
    unsigned int ctx_create_flags = 0;
    CUdevice device_id = 0;
    result = cuCtxCreate(&ctx, ctx_create_flags, device_id);
    // Note: The created context is also made the current context,
    // so we are _in_ a context from now on.
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    CUdeviceptr requested = 0;
    CUdeviceptr reserved;
    size_t size = 0x20000;
    size_t alignment = 0; // default
    unsigned long long reserve_flags = 0;
    // -----------------------------------
    // ==>> FAILURE on next statement <<==
    // -----------------------------------
    result = cuMemAddressReserve(&reserved, size, alignment, requested, reserve_flags);
    if (result != CUDA_SUCCESS) {
        const char* error_string;
        cuGetErrorString(result, &error_string);
        fprintf(stderr, "cuMemAddressReserve() failed: %s\n", error_string);
        exit(EXIT_FAILURE);
    }
    return 0;
}
It fails when attempting the reservation:
cuMemAddressReserve() failed: invalid argument
What is wrong with my arguments? Is it the size? The alignment? Requesting an address of 0? If it's the latter: how am I supposed to know which address to request, when I really don't care?
If I remember correctly, the sizes passed to the virtual memory management functions must be multiples of CUDA's allocation granularity. See cuMemGetAllocationGranularity and this blog post: https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management/
The following works on my machine.
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
int main() {
    CUresult result;
    unsigned int init_flags = 0;
    result = cuInit(init_flags);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    CUcontext ctx;
    unsigned int ctx_create_flags = 0;
    CUdevice device_id = 0;
    result = cuCtxCreate(&ctx, ctx_create_flags, device_id);
    // Note: The created context is also made the current context,
    // so we are _in_ a context from now on.
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    CUdeviceptr requested = 0;
    CUdeviceptr reserved;
    size_t size = 0x20000;
    size_t alignment = 0; // default
    unsigned long long reserve_flags = 0;
    size_t granularity;
    CUmemAllocationProp prop = {0}; // zero-initialize, then set the fields we care about
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = (int) device_id;
    prop.win32HandleMetaData = NULL;
    result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    printf("minimum granularity %zu\n", granularity);
    // Round the requested size up to the nearest multiple of the granularity
    size_t padded_size = ((granularity + size - 1) / granularity) * granularity;
    result = cuMemAddressReserve(&reserved, padded_size, alignment, requested, reserve_flags);
    if (result != CUDA_SUCCESS) {
        const char* error_string;
        cuGetErrorString(result, &error_string);
        fprintf(stderr, "cuMemAddressReserve() failed: %s\n", error_string);
        exit(EXIT_FAILURE);
    }
    return 0;
}
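For illustration, the round-up of padded_size can be checked in isolation. The following standalone sketch assumes, purely for the sake of the example, a granularity of 0x200000 (2 MiB); with the question's size of 0x20000 (128 KiB) it rounds up to one full granule:

#include <stddef.h>
#include <stdio.h>

int main(void)
{
    size_t granularity = 0x200000; // assumed 2 MiB granularity, for illustration only
    size_t size        = 0x20000;  // 128 KiB, the size from the question

    // Same round-up arithmetic as in the program above
    size_t padded_size = ((granularity + size - 1) / granularity) * granularity;

    printf("padded_size = 0x%zx\n", padded_size); // prints 0x200000
    return 0;
}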
tl;dr: Your reservation's size is not a multiple of (some device's) allocation granularity
As @AbatorAbetor suggested, cuMemAddressReserve() implicitly requires the size of the memory region to be a multiple of some granularity value. And despite 0x20000 seeming like a generous enough value (2^17 bytes = 128 KiB... system memory pages are typically 4 KiB = 2^12 bytes), NVIDIA GPUs are very demanding here.
For example, a Pascal GTX 1050 Ti GPU with about 4 GB of memory has a granularity of 0x200000, i.e. 2 MiB: 16 times the size you were trying to reserve.
Now, what happens if we have two devices with different granularity values? Would we need to use the least common multiple? Who knows.
Anyway, the bottom line is: Check your granularity before allocating and before reserving; a sketch of such a check follows below.
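Here is a minimal sketch of such a check, assuming cuInit() has already been called; the helper name check_reservation_size is my own, not part of the CUDA API:

#include <cuda.h>
#include <stdio.h>

// Print each device's minimum allocation granularity and whether a candidate
// reservation size is a multiple of it. Assumes cuInit() was already called.
static void check_reservation_size(size_t size)
{
    int device_count;
    if (cuDeviceGetCount(&device_count) != CUDA_SUCCESS) { return; }
    for (int i = 0; i < device_count; i++) {
        CUdevice device;
        if (cuDeviceGet(&device, i) != CUDA_SUCCESS) { continue; }

        CUmemAllocationProp prop = {0};
        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id = (int) device;

        size_t granularity;
        if (cuMemGetAllocationGranularity(&granularity, &prop,
                CU_MEM_ALLOC_GRANULARITY_MINIMUM) != CUDA_SUCCESS) { continue; }

        printf("device %d: minimum granularity 0x%zx; size 0x%zx is%s a multiple of it\n",
               i, granularity, size, (size % granularity == 0) ? "" : " not");
    }
}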
I have filed this as a documentation bug with NVIDIA, as bug 3486420 (but you will probably not be able to follow that link, since NVIDIA hides its bugs from its users).