我正在运行这段代码来排序大数组的ip使用thrust stable_sort和用户定义的操作符来比较ip。这段代码适用于小于50000个ip的数组,但我得到了大数组的内存错误。下面是我使用的代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <device_functions.h>
template<typename T>
struct vector_less
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
__host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
if (lhs[0] == rhs[0])
if (lhs[1] == rhs[1])
if (lhs[2] == rhs[2])
return lhs[3] < rhs[3];
else
return lhs[2] < rhs[2];
else
return lhs[1] < rhs[1];
else
return lhs[0] < rhs[0];
}
};
__global__ void prepare_ips_list(unsigned char ** dev_sorted_Ips, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
int thread = threadIdx.x + blockIdx.x * blockDim.x;
if (thread < searchedIpsSize)
{
dev_sorted_Ips[thread] = new unsigned char[4];
dev_sorted_Ips[thread][0] = ip_b1[thread];
dev_sorted_Ips[thread][1] = ip_b2[thread];
dev_sorted_Ips[thread][2] = ip_b3[thread];
dev_sorted_Ips[thread][3] = ip_b4[thread];
}
}
int main()
{
const int size = 1000000;
unsigned char * ip_b1 = new unsigned char[size];
unsigned char * ip_b2 = new unsigned char[size];;
unsigned char * ip_b3 = new unsigned char[size];;
unsigned char * ip_b4 = new unsigned char[size];;
unsigned char * dev_ip_b1;
unsigned char * dev_ip_b2;
unsigned char * dev_ip_b3;
unsigned char * dev_ip_b4;
unsigned char ** dev_sortedIps;
for (int i = 0; i < size; i++)
{
ip_b1[i] = rand() % 240;
ip_b2[i] = rand() % 240;
ip_b3[i] = rand() % 240;
ip_b4[i] = rand() % 240;
}
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b1, ip_b1, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b2, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b2, ip_b2, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b3, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b3, ip_b3, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_ip_b4, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_ip_b4, ip_b4, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_sortedIps, size * sizeof(unsigned char *));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
int resetThreads = size;
int resetBlocks = 1;
if (size > 1024)
{
resetThreads = 1024;
resetBlocks = size / 1024;
if (size % 1024 > 0)
resetBlocks++;
}
prepare_ips_list << <resetBlocks, resetThreads >> >(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);
thrust::device_ptr<unsigned char *> sorted_list_ptr1(dev_sortedIps);
thrust::stable_sort(sorted_list_ptr1, sorted_list_ptr1 + size, vector_less<unsigned char *>());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "launch failed: %sn", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching !n", cudaStatus);
goto Error;
}
return 0;
Error:
cudaFree(dev_ip_b1);
cudaFree(dev_ip_b2);
cudaFree(dev_ip_b3);
cudaFree(dev_ip_b4);
cudaFree(dev_sortedIps);
}
得到的错误是:Microsoft c++ exception: thrust::system::system_error at memory location
如何解决大数组的这个问题?我是否应该使用另一种技术来实现这种排序,比如对部分进行除法和排序,然后再合并?
最近的问题是内核内的malloc
和new
在设备堆的大小上是有限的,它们可以用来分配。这个限制可以提高。请阅读文档
其他一些建议:
-
在内核之后(在第一个推力调用之前)没有做任何错误检查。您应该对内核进行错误检查,然后您会发现是内核出了问题,而推力只是为您报告错误。避免混淆。当你遇到cuda代码的问题时,做严格的,正确的cuda错误检查。
-
作为一个好的实践,至少对于调试目的来说,测试
new
或malloc
返回的任何指针是否为NULL并不是一个坏主意。这就是API通知您发生分配失败的方式。
下面的代码演示了一种可能的解决方法,通过调整设备堆的输入大小。它还演示了解决其他两个建议的可能方法:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <device_functions.h>
#include <assert.h>
template<typename T>
struct vector_less
{
typedef T first_argument_type;
typedef T second_argument_type;
typedef bool result_type;
__host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
if (lhs[0] == rhs[0])
if (lhs[1] == rhs[1])
if (lhs[2] == rhs[2])
return lhs[3] < rhs[3];
else
return lhs[2] < rhs[2];
else
return lhs[1] < rhs[1];
else
return lhs[0] < rhs[0];
}
};
__global__ void prepare_ips_list(unsigned char ** dev_sorted_Ips, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
int thread = threadIdx.x + blockIdx.x * blockDim.x;
if (thread < searchedIpsSize)
{
dev_sorted_Ips[thread] = new unsigned char[4];
if (dev_sorted_Ips[thread] == NULL) assert(0);
dev_sorted_Ips[thread][0] = ip_b1[thread];
dev_sorted_Ips[thread][1] = ip_b2[thread];
dev_sorted_Ips[thread][2] = ip_b3[thread];
dev_sorted_Ips[thread][3] = ip_b4[thread];
}
}
int main(int argc, char *argv[])
{
int size = 50000;
if (argc > 1) size = atoi(argv[1]);
int chunks = size/50000 + 1;
cudaError_t cudaStatus;
cudaStatus = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8000000 * chunks);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "set device heap limit failed!");
}
unsigned char * ip_b1 = new unsigned char[size];
unsigned char * ip_b2 = new unsigned char[size];;
unsigned char * ip_b3 = new unsigned char[size];;
unsigned char * ip_b4 = new unsigned char[size];;
unsigned char * dev_ip_b1;
unsigned char * dev_ip_b2;
unsigned char * dev_ip_b3;
unsigned char * dev_ip_b4;
unsigned char ** dev_sortedIps;
for (int i = 0; i < size; i++)
{
ip_b1[i] = rand() % 240;
ip_b2[i] = rand() % 240;
ip_b3[i] = rand() % 240;
ip_b4[i] = rand() % 240;
}
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b1, ip_b1, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b2, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b2, ip_b2, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b3, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b3, ip_b3, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_ip_b4, size * sizeof(unsigned char));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_ip_b4, ip_b4, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaStatus = cudaMalloc((void**)&dev_sortedIps, size * sizeof(unsigned char *));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
int resetThreads = size;
int resetBlocks = 1;
if (size > 1024)
{
resetThreads = 1024;
resetBlocks = size / 1024;
if (size % 1024 > 0)
resetBlocks++;
}
prepare_ips_list << <resetBlocks, resetThreads >> >(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess){
printf(" kernel failn");
exit(0);}
thrust::device_ptr<unsigned char *> sorted_list_ptr1(dev_sortedIps);
thrust::stable_sort(sorted_list_ptr1, sorted_list_ptr1 + size, vector_less<unsigned char *>());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "launch failed: %sn", cudaGetErrorString(cudaStatus));
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching !n", cudaStatus);
}
return 0;
}
注意,您可以通过将所需的大小作为命令行参数传递来测试各种大小。我测试了高达1000000,它似乎工作良好。最终,对于足够大的问题规模,您将耗尽GPU上的内存。你没有说明你有什么GPU
我已经删除了goto语句,因为我在linux上工作(显然你已经切换回windows)。我建议你想出一个不同的错误处理过程,而不是使用goto,如果没有其他原因,它会导致使用推力结构的困难。
还要注意内核中的new
或malloc
有点"慢"。对于更大的大小,您可以通过使用单个适当大小的cudaMalloc
调用预先进行必要的分配来大大加快此速度。不幸的是,由于使用双指针数组dev_sorted_Ips
,这使情况变得复杂。我建议您将其扁平化为单个指针数组,通过cudaMalloc
分配一次必要的大小,并在内核中执行必要的数组索引以使其工作。如果您对这段代码进行分析,您会发现对于更长的情况(例如size = 1000000),绝大多数执行时间是由prepare_ips_list
内核消耗的,而不是排序操作。因此,您应该从这里开始努力改进性能。