CUDA错误消息:配置参数无效



我是CUDA的新手。我试着用一个简单的练习来帮助我熟悉它。我编写了一个小程序"寻找素数"。它几乎完成了,但有一个问题我无法解决。我发现我能找到的最大数字是1027。如果我输入超过1027,我会得到错误消息:

getPrimeKernel launch failed!!: invalid configuration argument
findPrimeWithCuda failed!!

我应该在哪里调整代码?非常感谢。

这是我的代码:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
using namespace std;
cudaError_t findPrimeWithCuda(bool *c, int *a, unsigned int size);
__host__ __device__ bool checkPrime(int i)
{
    for (int m = 2; m <= i - 1; m++)
    {
        if (i%m == 0) return true;
    }
    return false;
}
__global__ void getPrimeKernel(bool *c, int *a)
{
    int i = threadIdx.x;
    c[i] = checkPrime(a[i]);
}
void cudaGetPrime(int i)
{
    i = i - 3;
    int *arr = (int *)malloc((size_t)(i * sizeof(int)));
    bool *rst = (bool *)malloc((size_t)(i * sizeof(bool)));
    for (int j = 0; j <= i; j++) arr[j] = j + 3;
    cudaError_t cudaStatus = findPrimeWithCuda(rst, arr, i);
    if (cudaStatus != cudaSuccess) fprintf(stderr,"findPrimeWithCuda failed!!");
}
void w_CudaArray(int lastNum)
{
    time_t t1 = time(NULL);
    cudaGetPrime(lastNum);
    time_t t2 = time(NULL);
    printf("Time to spent : %d secondn", t2 - t1);
    cout << "Computing with CUDA to count the prime numbers ends!!" << endl << endl;
}
int main()
{
    int lastNum = 0;
    cout << "The final number which you want to find the prime numbers : ";
    cin >> lastNum;
    w_CudaArray(lastNum);
}
cudaError_t findPrimeWithCuda(bool *c, int *a, unsigned int size)
{
    int *dev_a = 0;
    bool *dev_c = false;
    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed!!");
        goto Error;
    }
    size_t totalm, freem;
    float free_m, total_m, used_m;
    cudaMemGetInfo(&freem, &totalm);
    free_m = (size_t)freem / 1048576.0;
    total_m = (size_t)totalm / 1048576.0;
    used_m = total_m - free_m;
    cout << "Total memory = " << total_m << " MB" << endl;
    cout << "Used memory  = " << used_m << " MB" << endl;
    cout << "Free memory  = " << free_m << " MB" << endl;
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc dev_a failed!!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(bool));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc dev_c failed!!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy dev_a failed!!");
        goto Error;
    }
    getPrimeKernel<<<1, size>>>(dev_c, dev_a);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "getPrimeKernel launch failed!!: %sn", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchorinze returned error code %d after launching getPrimeKernel!n", cudaStatus);
        goto Error;
    }
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(bool), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemory failed!");
        goto Error;
    }
    int trueNumber = 0;
    for (int i = 0; i < size; i++)
    {
        if (c[i] == false) trueNumber++;
    }
    cout << "There are " << trueNumber + 2 << " prime numbers!!" << endl;
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!!");
    }
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    return cudaStatus;
}

调用内核时,size是每个块的线程数(请参阅《编程指南》中关于内核调用语法的部分)。但每个块的最大线程数是有限的,这取决于设备的计算能力。您可能有一个设备,每个块最多有1024个线程。因此,如果使用更大的数字(在您的情况下为1027),它将不再有效。维基百科上的这个表列出了每个块的最大线程数,具体取决于设备。

您可以使用cudaDeviceGetProperties查询每个块的最大线程数,并查看maxThreadsPerBlock字段。

为了处理大于每个块的线程数的输入大小,您需要。。。更多的方块!这意味着您还必须使用更大的网格大小。因此,您必须为您的输入计算适当的网格和块大小,并将这些作为参数传递给内核启动。

例如,如下所示:

int inputSize = ...; // The size of the input data
int threadsPerBlock = 256; // May be queried from the device properties
int blocksPerGrid = (inputSize + threadsPerBlock - 1) / threadsPerBlock;
callKernel<<<blocksPerGrid, threadsPerBlock>>>(...);

这只表明了基本的想法。有关详细信息,请参阅CUDA编程指南或可用的示例。例如,您还必须使用全局线程索引,并确保不会访问无效的内存区域。这可以通过将输入数组的长度传递给内核来实现,并检查您是否仍在边界内:

__global__ void callKernel(int *array, int arrayLength) {
    // Make sure to compute the GLOBAL thread index like this:
    int i = blockDim.x*blockIdx.x + threadIdx.x
    if (i >= arrayLength) {
        return;
    }
    ....
}

(题外话:对于练习来说,任何事情都可以,但我不确定这种形式的素数查找是否真的很适合CUDA。你可以考虑看看更"经典"的CUDA例子,比如向量加法或矩阵乘法…)

我修改了部分代码,问题得到了解决。

__global__ void getPrimeKernel(bool *c, int *a, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= size) return;
    c[i] = checkPrime(a[i]);
}

和这个

cudaDeviceProp myCUDA;
if (cudaGetDeviceProperties(&myCUDA, 0) == cudaSuccess)
{
    printf("Using device %d:n", 0);
    printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHzn",
        myCUDA.name, (int)myCUDA.totalGlobalMem, (int)myCUDA.major,
        (int)myCUDA.minor, (int)myCUDA.clockRate);
}
int threadsPerBlock = myCUDA.maxThreadsPerBlock;
int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
cout << "Maxium number per block = " << threadsPerBlock << endl;
cout << "Blocks per Grid = " << blocksPerGrid << endl;
getPrimeKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_a, size);

现在它可以计算超过1027的数字。希望这部分代码可以帮助像我这样的新手。:)

相关内容

最新更新