甚至使用一个线程的内核在GPU上进行奇怪的排序行为,并具有顺序实现


__global__ void sort_single(int *size , int *arr){
for ( int m = 0; m < *size / 2; m++)
{
    for (int  i = 0; i < *size; i += 2)
    {
        if (arr[i + 1] > arr[i])
        {
            int temp = arr[i];
            arr[i] = arr[i + 1];
            arr[i + 1] = temp;
        }
    }
    /*for (int i = 0; i < size; i++)
    printf("%d ", arr[i]);
    printf("n");*/
    for ( int i = 1; i < *size; i += 2)
    {
        if (arr[i + 1] > arr[i])
        {
            int temp = arr[i];
            arr[i] = arr[i + 1];
            arr[i + 1] = temp;
        }
    }
 }
}

这是Cuda GPU的内核代码。为了调用它,我在准备所有数据后从Main使用它。 sort_single<<<1,1>>>(d_a,d_b); 我的问题是,为什么在此处给出错误的结果,而如果我将此代码作为常规C 功能代码运行,则给出了正确的结果。如果我删除内核中的外循环并在该循环中调用内核下面。

for ( int m = 0; m < N / 2; m++)
sort_single<<<1,1>>>(d_a,d_b);

我在这里做同样的事情我认为它必须对此算法所需的步骤数量做一些事情,例如,它需要每次迭代2个步骤。

  1. 将偶数索引与下一个索引进行比较。

  2. 将奇数索引与下一个索引进行比较。

我无法理解为什么会增加数组中的元素数量,因为我使用了单个GPU线程。我需要清楚地了解GPU单线与CPU的不同,以了解当前行为。驱动器上的整个单个文件链接

这是文件内容:

#include "stdio.h"
__global__ void add(int *a , int *b ,int*c){
        c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
__global__ void sort_single(int *size , int *arr){
    for ( int m = 0; m < *size / 2; m++)
    {   
        for (int  i = 0; i < *size; i += 2)
        {
            if (arr[i + 1] > arr[i])
            {
                int temp = arr[i];
                arr[i] = arr[i + 1];
                arr[i + 1] = temp;
            }
        }
        /*for (int i = 0; i < size; i++)
        printf("%d ", arr[i]);
        printf("n");*/
        for ( int i = 1; i < *size; i += 2)
        {
            if (arr[i + 1] > arr[i])
            {
                int temp = arr[i];
                arr[i] = arr[i + 1];
                arr[i + 1] = temp;
            }
        }
    }
}
void random_ints(int *a, int N)
{
   int i;
   for (i = 0; i < N; ++i)
    a[i] = rand() %5000;
}
void uniform_ints(int *a, int N)
{
   int i;
   for (i = 0; i < N; ++i)
    a[i] = i+1;
}
int main(int argc , char**argv){
    int N = 8;
    if(argc>1)
      {
          N=atoi(argv[1]);
      }
    int *a , *b  ;
    int *d_a , *d_b ;
    int isize = N * sizeof(int);
    a = (int *)malloc(sizeof(int));a[0] = N;
    b = (int *)malloc(isize);uniform_ints(b , N);

    cudaError_t cudaStatus;
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&d_a,sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&d_b,isize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }    
    cudaStatus = cudaMemcpy(d_a, a , sizeof(int),cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(d_b, b , isize,cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    sort_single<<<1,1>>>(d_a,d_b);
    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %sn", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!n", cudaStatus);
        goto Error;
    }
    cudaStatus = cudaMemcpy(b, d_b , isize,cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    for (int i = 0; i < N; i++)
        printf("%d ", b[i]);
    printf("n");
    Error:
    cudaFree(d_a);
    cudaFree(d_b);
    return cudaStatus;
}

该代码和cuda内核中的代码基于数组的大小均匀的假设。它给出了正确的结果,错误是mallocking。无论如何,感谢您的回复。

最新更新