从用户空间应用程序读取原始GPU内存



我正在尝试从用户空间应用程序读取原始GPU内存。思路是在应用程序中用mmap映射/sys/bus/pci/devices/[device addr]/resource1,然后对其执行加载(load)和存储(store)操作。

这里的设备是带有8gb板载内存的Nvidia 3060Ti。BAR配置为可调整大小,因此所有8gb的内存都应该是可访问的:

(base) [xps] pcimem git:(master) ✗ ls -lah /sys/bus/pci/devices/0000:01:00.0/resource*                   
-r--r--r-- 1 root root 4,0K avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource
-rw------- 1 root root  16M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource0
-rw------- 1 root root 8,0G avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource1
-rw------- 1 root root 8,0G avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource1_wc
-rw------- 1 root root  32M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource3
-rw------- 1 root root  32M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource3_wc
-rw------- 1 root root  128 avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource5

使用pcimem访问这块内存不起作用。向某个位置写入0之后,紧接着的第一次读取会返回0,但此后的每次读取都会返回0x000000005665BDF5。在第一次读取之后,所有位置返回的都是同一个值0x000000005665BDF5。

对这些(失败的)读/写进行基准测试似乎表明它们确实到达了GPU。读取延迟约为900ns,接近PCIe往返时间。

我也尝试过直接mmap帧缓冲区(/dev/fb0)并对其进行读写。这是可行的,并且读/写延迟与上面的结果相近。但是,对于我的用途来说,帧缓冲区太小了。

CUDA行不通,因为在读取设备内存时,GPU会把对应的页迁移到主机端。

是否有办法从Linux访问GPU上的内存?

我的目标是能够在用户空间应用程序中映射GPU的内存,并将其用作内存扩展。用户空间应用程序(在CPU上运行)将直接在GPU的内存上分配和访问数据结构。

TIA

看起来您可以使用GDRCopy库,或者至少是它的内核驱动程序。来自网站:

GDRCopy是一个基于GPUDirect RDMA技术的低延迟GPU内存拷贝库,允许CPU直接映射和访问GPU内存。

解决方案是使用Vulkan API在GPU上分配一个堆并访问它。但是,由于x86不能缓存MMIO地址,因此每次访问都会通过PCIe到达GPU。

实现与Nvidia的服务器解决方案具有相同的延迟。

下面是一个简单的C++实现,它将GPU显存抽象为堆内存,并允许在其上调用malloc()和free()。

查看堆类型:http://vulkan.gpuinfo.org/displayreport.php?id=14928#memory

在createVertexBuffer()中调用findMemoryType()时,需要先确认您的GPU支持哪些内存属性标志。

#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <optional>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

#include <vulkan/vulkan.h>

#include "libvram/libvram.hh"
class VRamWrapper;

// Process-wide wrapper instance; null until ctor_libvram() assigns it.
VRamWrapper *vrw_obj = nullptr;

// Device extensions a GPU must advertise to be selected; the same list is
// enabled when the logical device is created.
const char *deviceExtensions[] = {VK_KHR_SWAPCHAIN_EXTENSION_NAME};

// Number of entries in deviceExtensions, derived from the array itself.
constexpr size_t DEV_EXT_LEN =
    sizeof(deviceExtensions) / sizeof(deviceExtensions[0]);
/// Queue family indices discovered on a physical device.
///
/// A field stays empty until a matching queue family has been found.
struct QueueFamilyIndices {
  /// Index of a queue family that supports graphics operations, if any.
  std::optional<uint32_t> graphicsFamily;

  /// @return true once every required queue family index has been filled in.
  ///
  /// Declared const so it can be called on const objects and references.
  bool isComplete() const noexcept { return graphicsFamily.has_value(); }
};
class VRamWrapper {
public:
void init() { initVulkan(); }
void *malloc(size_t bytes) { return this->createVertexBuffer(bytes); }
void free(void *buf) { assert(0); }
private:
VkInstance instance;
VkPhysicalDevice physicalDevice = VK_NULL_HANDLE;
VkDevice device;
VkQueue graphicsQueue;
std::vector<VkBuffer> buffers;
std::vector<VkDeviceMemory> bufferMemories;
void initVulkan() {
createInstance();
pickPhysicalDevice();
createLogicalDevice();
}
void cleanup() {
for (auto buf : buffers) {
vkDestroyBuffer(device, buf, nullptr);
}
for (auto mem : bufferMemories) {
vkFreeMemory(device, mem, nullptr);
}
vkDestroyDevice(device, nullptr);
vkDestroyInstance(instance, nullptr);
}
void createInstance() {
VkApplicationInfo appInfo{};
appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
appInfo.pApplicationName = "Hello Triangle";
appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
appInfo.pEngineName = "No Engine";
appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
appInfo.apiVersion = VK_API_VERSION_1_0;
VkInstanceCreateInfo createInfo{};
createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
createInfo.pApplicationInfo = &appInfo;
createInfo.enabledLayerCount = 0;
createInfo.pNext = nullptr;
if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) {
throw std::runtime_error("failed to create instance!");
}
}
void pickPhysicalDevice() {
uint32_t deviceCount = 0;
vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr);
if (deviceCount == 0) {
throw std::runtime_error("failed to find GPUs with Vulkan support!");
}
std::vector<VkPhysicalDevice> devices(deviceCount);
vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data());
for (const auto &device : devices) {
if (isDeviceSuitable(device)) {
physicalDevice = device;
break;
}
}
if (physicalDevice == VK_NULL_HANDLE) {
throw std::runtime_error("failed to find a suitable GPU!");
}
}
void createLogicalDevice() {
QueueFamilyIndices indices = findQueueFamilies(physicalDevice);
std::vector<VkDeviceQueueCreateInfo> queueCreateInfos;
std::set<uint32_t> uniqueQueueFamilies = {indices.graphicsFamily.value()};
float queuePriority = 1.0f;
for (uint32_t queueFamily : uniqueQueueFamilies) {
VkDeviceQueueCreateInfo queueCreateInfo{};
queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
queueCreateInfo.queueFamilyIndex = queueFamily;
queueCreateInfo.queueCount = 1;
queueCreateInfo.pQueuePriorities = &queuePriority;
queueCreateInfos.push_back(queueCreateInfo);
}
VkPhysicalDeviceFeatures deviceFeatures{};
VkDeviceCreateInfo createInfo{};
createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
createInfo.queueCreateInfoCount =
static_cast<uint32_t>(queueCreateInfos.size());
createInfo.pQueueCreateInfos = queueCreateInfos.data();
createInfo.pEnabledFeatures = &deviceFeatures;
createInfo.enabledExtensionCount = static_cast<uint32_t>(DEV_EXT_LEN);
createInfo.ppEnabledExtensionNames = deviceExtensions;
createInfo.enabledLayerCount = 0;
if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) !=
VK_SUCCESS) {
throw std::runtime_error("failed to create logical device!");
}
vkGetDeviceQueue(device, indices.graphicsFamily.value(), 0, &graphicsQueue);
}
void *createVertexBuffer(size_t bytes) {
VkBuffer buffer;
VkDeviceMemory bufferMemory;
VkBufferCreateInfo bufferInfo{};
bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
bufferInfo.size = bytes;
bufferInfo.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
throw std::runtime_error("failed to create vertex buffer!");
}
VkMemoryRequirements memRequirements;
vkGetBufferMemoryRequirements(device, buffer, &memRequirements);
assert(memRequirements.size == bytes);
VkMemoryAllocateInfo allocInfo{};
allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
allocInfo.allocationSize = memRequirements.size;
allocInfo.memoryTypeIndex =
findMemoryType(memRequirements.memoryTypeBits,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
if (auto res = vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory);
res != VK_SUCCESS) {
throw std::runtime_error("failed to allocate vertex buffer memory");
}
vkBindBufferMemory(device, buffer, bufferMemory, 0);
void *data;
auto res = vkMapMemory(device, bufferMemory, 0, bytes, 0, &data);
if (res != VK_SUCCESS) {
throw std::runtime_error("Map failed");
}
fprintf(stderr, "Map completed. Allocated %lu MiB at %pn",
(bytes) / (1024UL * 1024), data);
this->buffers.push_back(buffer);
this->bufferMemories.push_back(bufferMemory);
return data;
}
uint32_t findMemoryType(uint32_t typeFilter,
VkMemoryPropertyFlags properties) {
VkPhysicalDeviceMemoryProperties memProperties;
vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties);
for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
if ((typeFilter & (1 << i)) &&
(memProperties.memoryTypes[i].propertyFlags & properties) ==
properties) {
return i;
}
}
throw std::runtime_error("failed to find suitable memory type!");
}
bool isDeviceSuitable(VkPhysicalDevice device) {
QueueFamilyIndices indices = findQueueFamilies(device);
bool extensionsSupported = checkDeviceExtensionSupport(device);
return indices.isComplete() &&
extensionsSupported /* && swapChainAdequate */;
}
bool checkDeviceExtensionSupport(VkPhysicalDevice device) {
uint32_t extensionCount;
vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount,
nullptr);
std::vector<VkExtensionProperties> availableExtensions(extensionCount);
vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount,
availableExtensions.data());
std::set<std::string> requiredExtensions(deviceExtensions,
deviceExtensions + DEV_EXT_LEN);
for (const auto &extension : availableExtensions) {
requiredExtensions.erase(extension.extensionName);
}
return requiredExtensions.empty();
}
QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) {
QueueFamilyIndices indices;
uint32_t queueFamilyCount = 0;
vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount,
nullptr);
std::vector<VkQueueFamilyProperties> queueFamilies(queueFamilyCount);
vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount,
queueFamilies.data());
int i = 0;
for (const auto &queueFamily : queueFamilies) {
if (queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) {
indices.graphicsFamily = i;
}
if (indices.isComplete()) {
break;
}
i++;
}
return indices;
}
};
void ctor_libvram() {
fprintf(stderr, "%s() calledn", __FUNCTION__);
vrw_obj = new VRamWrapper();
vrw_obj->init();
}
/// Allocates `bytes` bytes by forwarding to the global VRamWrapper.
///
/// @return the pointer returned by VRamWrapper::malloc().
/// @throws std::runtime_error if the global wrapper has not been created
///         yet (it is assigned by ctor_libvram()), instead of dereferencing
///         a null pointer.
void *libvram::malloc(size_t bytes) {
  if (vrw_obj == nullptr) {
    throw std::runtime_error("libvram is not initialized");
  }
  return vrw_obj->malloc(bytes);
}
/// Releases `ptr` by forwarding to the global VRamWrapper.
///
/// A null `ptr` is ignored, matching the usual free() contract.
///
/// @throws std::runtime_error if `ptr` is non-null but the global wrapper
///         has not been created yet (it is assigned by ctor_libvram()).
void libvram::free(void *ptr) {
  if (ptr == nullptr) {
    return;
  }
  if (vrw_obj == nullptr) {
    throw std::runtime_error("libvram is not initialized");
  }
  vrw_obj->free(ptr);
}

最新更新