我是CUDA的新手。我试着用一个简单的练习来帮助我熟悉它。我编写了一个小程序"寻找素数"。它几乎完成了,但有一个问题我无法解决。我发现我能找到的最大数字是1027。如果我输入超过1027,我会得到错误消息:
getPrimeKernel launch failed!!: invalid configuration argument
findPrimeWithCuda failed!!
我应该在哪里调整代码?非常感谢。
这是我的代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
using namespace std;
cudaError_t findPrimeWithCuda(bool *c, int *a, unsigned int size);
__host__ __device__ bool checkPrime(int i)
{
for (int m = 2; m <= i - 1; m++)
{
if (i%m == 0) return true;
}
return false;
}
__global__ void getPrimeKernel(bool *c, int *a)
{
int i = threadIdx.x;
c[i] = checkPrime(a[i]);
}
void cudaGetPrime(int i)
{
i = i - 3;
int *arr = (int *)malloc((size_t)(i * sizeof(int)));
bool *rst = (bool *)malloc((size_t)(i * sizeof(bool)));
for (int j = 0; j <= i; j++) arr[j] = j + 3;
cudaError_t cudaStatus = findPrimeWithCuda(rst, arr, i);
if (cudaStatus != cudaSuccess) fprintf(stderr,"findPrimeWithCuda failed!!");
}
void w_CudaArray(int lastNum)
{
time_t t1 = time(NULL);
cudaGetPrime(lastNum);
time_t t2 = time(NULL);
printf("Time to spent : %d secondn", t2 - t1);
cout << "Computing with CUDA to count the prime numbers ends!!" << endl << endl;
}
int main()
{
int lastNum = 0;
cout << "The final number which you want to find the prime numbers : ";
cin >> lastNum;
w_CudaArray(lastNum);
}
cudaError_t findPrimeWithCuda(bool *c, int *a, unsigned int size)
{
int *dev_a = 0;
bool *dev_c = false;
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed!!");
goto Error;
}
size_t totalm, freem;
float free_m, total_m, used_m;
cudaMemGetInfo(&freem, &totalm);
free_m = (size_t)freem / 1048576.0;
total_m = (size_t)totalm / 1048576.0;
used_m = total_m - free_m;
cout << "Total memory = " << total_m << " MB" << endl;
cout << "Used memory = " << used_m << " MB" << endl;
cout << "Free memory = " << free_m << " MB" << endl;
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc dev_a failed!!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(bool));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc dev_c failed!!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy dev_a failed!!");
goto Error;
}
getPrimeKernel<<<1, size>>>(dev_c, dev_a);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "getPrimeKernel launch failed!!: %sn", cudaGetErrorString(cudaStatus));
goto Error;
}
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceSynchorinze returned error code %d after launching getPrimeKernel!n", cudaStatus);
goto Error;
}
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(bool), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemory failed!");
goto Error;
}
int trueNumber = 0;
for (int i = 0; i < size; i++)
{
if (c[i] == false) trueNumber++;
}
cout << "There are " << trueNumber + 2 << " prime numbers!!" << endl;
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset failed!!");
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
return cudaStatus;
}
调用内核时,size
是每个块的线程数(请参阅《编程指南》中关于内核调用语法的部分)。但每个块的最大线程数是有限的,这取决于设备的计算能力。您可能有一个设备,每个块最多有1024个线程。因此,如果使用更大的数字(在您的情况下为1027),它将不再有效。维基百科上的这个表列出了每个块的最大线程数,具体取决于设备。
您可以使用cudaDeviceGetProperties查询每个块的最大线程数,并查看maxThreadsPerBlock字段。
为了处理大于每个块的线程数的输入大小,您需要。。。更多的方块!这意味着您还必须使用更大的网格大小。因此,您必须为您的输入计算适当的网格和块大小,并将这些作为参数传递给内核启动。
例如,如下所示:
int inputSize = ...; // The size of the input data
int threadsPerBlock = 256; // May be queried from the device properties
int blocksPerGrid = (inputSize + threadsPerBlock - 1) / threadsPerBlock;
callKernel<<<blocksPerGrid, threadsPerBlock>>>(...);
这只表明了基本的想法。有关详细信息,请参阅CUDA编程指南或可用的示例。例如,您还必须使用全局线程索引,并确保不会访问无效的内存区域。这可以通过将输入数组的长度传递给内核来实现,并检查您是否仍在边界内:
__global__ void callKernel(int *array, int arrayLength) {
// Make sure to compute the GLOBAL thread index like this:
int i = blockDim.x*blockIdx.x + threadIdx.x
if (i >= arrayLength) {
return;
}
....
}
(题外话:对于练习来说,任何事情都可以,但我不确定这种形式的素数查找是否真的很适合CUDA。你可以考虑看看更"经典"的CUDA例子,比如向量加法或矩阵乘法…)
我修改了部分代码,问题得到了解决。
__global__ void getPrimeKernel(bool *c, int *a, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= size) return;
c[i] = checkPrime(a[i]);
}
和这个
cudaDeviceProp myCUDA;
if (cudaGetDeviceProperties(&myCUDA, 0) == cudaSuccess)
{
printf("Using device %d:n", 0);
printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHzn",
myCUDA.name, (int)myCUDA.totalGlobalMem, (int)myCUDA.major,
(int)myCUDA.minor, (int)myCUDA.clockRate);
}
int threadsPerBlock = myCUDA.maxThreadsPerBlock;
int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
cout << "Maxium number per block = " << threadsPerBlock << endl;
cout << "Blocks per Grid = " << blocksPerGrid << endl;
getPrimeKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_a, size);
现在它可以计算超过1027的数字。希望这部分代码可以帮助像我这样的新手。:)