如何在CUDA中对一维(1D)数组使用纹理内存

我写了下面的代码，想了解如何对一维数组使用纹理内存。但是tex1D函数没有从数组中取到与线程id对应的值。请帮我纠正这段代码，并告诉我如何高效且正确地对一维数组使用纹理内存。

// Kernel from the question: each thread samples the 1D texture object at its
// own threadIdx.x and stores the fetched float in b[threadIdx.x].
//
// b      - output buffer; indexed by threadIdx.x only, so it is written for a
//          single block's worth of threads.
// texObj - texture object to sample with tex1D<float>.
//
// NOTE(review): the coordinate passed to tex1D is the raw thread index, while
// main() below creates the texture with normalizedCoords = 1. That mismatch is
// what the question is asking about.
__global__ void sum(float *b,cudaTextureObject_t texObj)
    {
    b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
    //printf("n%fn",tex1Dfetch<float>(texObj,threadIdx.x));
    }
    // Host driver from the question: fills a 5-element float array with 0..4,
    // uploads it into a CUDA array, wraps that array in a texture object,
    // launches sum with one block of 5 threads, and prints the results.
    int main()
    {
    float *a,*b;
    // NOTE(review): d_a is declared but never used in this function.
    float *d_a,*d_b;
    int i;
    // Host buffers; neither a nor b is freed before main returns.
    a=(float*)malloc(sizeof(float)*5);
    b=(float*)malloc(sizeof(float)*5);
    for(i=0;i<5;i++)
        a[i]=i;
    // Single-channel 32-bit float texel format.
    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    // Width 5, height 0: a 1D CUDA array of 5 texels.
    cudaMallocArray(&cuArray, &channelDesc, 5, 0);
    cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*5,cudaMemcpyHostToDevice);

    // Resource descriptor: the texture is backed by cuArray.
    struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;

      // Texture descriptor: wrap addressing, linear filtering, element-type
      // reads, and normalized coordinates.
      struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModeLinear;
        texDesc.readMode         = cudaReadModeElementType;
        // NOTE(review): normalized coordinates are enabled here, but the sum
        // kernel above passes threadIdx.x (0..4) as the coordinate.
        texDesc.normalizedCoords = 1;
        // Create texture object
        cudaTextureObject_t texObj = 0;
        cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);

    cudaMalloc(&d_b, 5* sizeof(float));
    // One block of 5 threads, one output element per thread.
    sum<<<1,5>>>(d_b,texObj);

        // Copy the result back to the host.
        // NOTE(review): the size argument is sizeof(float), i.e. one element,
        // while the loop below prints b[0]..b[4].
    cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
     for(i=0;i<5;i++)
        printf("%ft",b[i]);
      // Release the texture object, the CUDA array and the device buffer.
      cudaDestroyTextureObject(texObj); 
    cudaFreeArray(cuArray);
    cudaFree(d_b);
        return 0;
    }

至少有两个问题:

在程序末尾，你只从设备向主机复制了一个浮点数:

cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
                 ^^^^^^^^^^^^^

如果你想打印5个值，你应该复制5个值回来:

cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);

您已选择归一化坐标:

texDesc.normalizedCoords = 1;

这意味着您应该传递0到1之间的浮点坐标作为索引，而不是0到4之间的整数坐标:

 b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                    ^^^^^^^^^^^

可以这样写:

 b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));

通过这些更改，我得到了合理的结果。下面是完整的代码:

$ cat t3.cu
#include <stdio.h>
// Samples the 1D texture once per thread and stores the result.
//
// Thread t reads the texture at coordinate (t + 1) / 5 and writes the fetched
// float to b[t]. Only threadIdx.x is used for indexing, so the kernel is meant
// to be launched with a single block; b must hold at least blockDim.x floats.
// The divisor 5.0f is fixed in the kernel and is not derived from the texture.
__global__ void sum(float *b, cudaTextureObject_t texObj)
{
    const unsigned int tid = threadIdx.x;
    const float coord = (float)(tid + 1) / 5.0f;
    b[tid] = tex1D<float>(texObj, coord);
}

// Evaluates a CUDA runtime call; on failure prints the error string and
// leaves main() with exit status 1. Usable only inside main() because it
// expands to a `return 1;`.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,       \
                    cudaGetErrorString(status_));                          \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Uploads the values 0..4 into a 1D CUDA array, binds it to a texture object
// configured for linear filtering with normalized coordinates, launches the
// sum kernel with one thread per sample, and prints the sampled values
// separated by tabs.
//
// Returns 0 on success, 1 if any CUDA runtime call reports an error.
int main()
{
    // Number of texels. The sum kernel divides by a hard-coded 5.0f, so this
    // must stay 5 unless the kernel is changed as well.
    const int N = 5;
    // One sample per launched thread; the kernel is launched with N - 1
    // threads, so exactly N - 1 output values are produced.
    const int numSamples = N - 1;

    float h_in[N];
    float h_out[numSamples];
    for (int i = 0; i < N; ++i)
        h_in[i] = (float)i;

    // Single-channel 32-bit float texels in a 1D CUDA array (height 0).
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cuArray = NULL;
    CUDA_CHECK(cudaMallocArray(&cuArray, &channelDesc, N, 0));
    // cudaMemcpyToArray is deprecated; a one-row 2D copy is the replacement.
    CUDA_CHECK(cudaMemcpy2DToArray(cuArray, 0, 0, h_in,
                                   N * sizeof(float), N * sizeof(float), 1,
                                   cudaMemcpyHostToDevice));

    // The texture reads from the CUDA array.
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0]   = cudaAddressModeWrap;
    texDesc.addressMode[1]   = cudaAddressModeWrap;
    texDesc.filterMode       = cudaFilterModeLinear;
    texDesc.readMode         = cudaReadModeElementType;
    // Coordinates passed to tex1D are normalized rather than texel indices.
    texDesc.normalizedCoords = 1;

    cudaTextureObject_t texObj = 0;
    CUDA_CHECK(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));

    float *d_b = NULL;
    CUDA_CHECK(cudaMalloc(&d_b, numSamples * sizeof(float)));

    sum<<<1, numSamples>>>(d_b, texObj);
    // A kernel launch returns no status itself; query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy back only the elements the kernel actually wrote.
    CUDA_CHECK(cudaMemcpy(h_out, d_b, numSamples * sizeof(float),
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < numSamples; ++i)
        printf("%f\t", h_out[i]);
    printf("\n");

    // Release the texture object, the CUDA array and the device buffer.
    CUDA_CHECK(cudaDestroyTextureObject(texObj));
    CUDA_CHECK(cudaFreeArray(cuArray));
    CUDA_CHECK(cudaFree(d_b));
    return 0;
}
$ nvcc -arch=sm_61 -o t3 t3.cu
$ cuda-memcheck ./t3
========= CUDA-MEMCHECK
0.500000        1.500000        2.500000        3.500000
========= ERROR SUMMARY: 0 errors
$

注意，我还做了一些其他改动。特别是，我调整了采样点的位置和数量(因此完整代码中的坐标是 (threadIdx.x+1)/5.0f，并且只启动4个线程，而不是上面片段中的 threadIdx.x/5.0f 和5个线程)，使每个采样点恰好落在您的5个数据点(0,1,2,3,4)中相邻两点的正中间并进行线性插值，因此总共输出4个值(0.5,1.5,2.5,3.5)，即这5个数据点两两之间的中点。

如果您想进一步了解归一化坐标索引，以及边界(寻址)模式等其他概念，可以参阅CUDA编程指南。此外，还有各种CUDA示例代码演示了纹理的正确用法。

相关内容

最新更新

热门标签：