如何在CUDA中创建和使用1D分层纹理

我是CUDA新手。我已经知道如何在CUDA中创建1D和2D纹理，但不清楚该如何使用1D分层纹理。我的内核通过纹理读取得到的输出全是零，这显然不正确，可我不确定自己哪里做错了。我怀疑是纹理没有设置正确，但我检查了CUDA错误，并没有发现任何问题。有人能告诉我如何正确地设置和使用1D分层纹理吗？这是我的代码。提前感谢：

// To Compile: nvcc backproj.cu -o backproj.out
// To Run: ./backproj.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
// pi, obtained by calling acos(-1) wherever the macro is expanded
#define pi acos(-1)
// 1D layered float texture reference; the kernel below samples it with tex1DLayered
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: samples one layer of texRef at every requested location,
// one thread per location (comparable to 1D interpolation in MATLAB).
//   d_output     - receives one sampled value per location
//   d_locations  - positions to sample, one per thread
//   numlocations - number of entries in d_locations / d_output
//   layer        - index of the texture layer to read from
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int layer) {
    const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
    // Threads past the end of the location list have nothing to do
    if (tid >= numlocations) return;
    // Offset the requested location by 0.5 before fetching from the texture
    const float texCoord = 0.5f + (float) d_locations[tid];
    // Fetch from the selected layer and store the result in global memory
    d_output[tid] = tex1DLayered(texRef, texCoord, layer);
}
// Host code (the program as posted in the question).
// NOTE: the CUDA array below is allocated with cudaMallocArray and filled with
// cudaMemcpyToArray. According to the answer that follows, a 1D layered texture
// must instead be allocated with cudaMalloc3DArray and the cudaArrayLayered flag;
// that part is intentionally left as posted so the question stays reproducible.
// Only the "\n" escapes that had lost their backslash in the printf calls are
// restored, and the d_loc buffer is now released at the end.
int main()
{
    // Setup h_data and locations to interpolate from
    const unsigned int len = 10;
    const unsigned int numlayers = 3;
    const unsigned int upsamp = 3;
    // upsamp output samples per input interval, plus the final end point
    const unsigned int loclen = 1 + (len - 1) * upsamp;
    float idx_spacing = 1/(float)upsamp;
    float h_data[len][numlayers], h_loc[loclen];
    for (int i = 0; i < len; i++) 
        for (int j = 0; j < numlayers; j++)
            h_data[i][j] = 1+cosf((float) pi*i/(j+1.0f));
    for (int i = 0; i < loclen; i ++) 
        h_loc[i] = i*idx_spacing;
    // Copy the locations to interpolate at into device memory
    float* d_loc;
    cudaMalloc(&d_loc, loclen * sizeof(float));
    cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
    // Allocate CUDA array in device memory (see NOTE above: this is not a layered array)
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, len, numlayers);
    // Copy to device memory some data located at address h_data in host memory 
    cudaMemcpyToArray(cuArray, 0, 0, h_data, len * numlayers * sizeof(float), cudaMemcpyHostToDevice);
    // Set texture reference parameters
    texRef.addressMode[0] = cudaAddressModeBorder;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = false;
    // Bind the array to the texture reference
    cudaBindTextureToArray(texRef, cuArray, channelDesc);
    // Allocate result of transformation in device memory
    float* d_output;
    cudaMalloc(&d_output, loclen * sizeof(float));
    // Invoke kernel on layer 0
    int thdsPerBlk = 256;
    int blksPerGrid = (int) (loclen / thdsPerBlk) + 1;
    printf("Threads Per Block: %d, Blocks Per Grid: %d\n", thdsPerBlk, blksPerGrid);
    interp1Kernel <<<blksPerGrid, thdsPerBlk >>>(d_output, d_loc, loclen, 0);
    // Print Results
    printf("\n Original Indices \n");
    for (int i = 0; i < len; i++) printf("    %d ", i);
    printf("\n Original array \n");
    for (int i = 0; i < len; i++) printf("%5.3f ", h_data[i][0]);
    printf("\n Output Indices \n");
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n Output Array \n");
    // h_loc is reused as the host buffer for the kernel output
    cudaMemcpy(h_loc, d_output, loclen * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n");
    // Free device memory
    cudaFreeArray(cuArray);
    cudaFree(d_output);
    cudaFree(d_loc);
    return 0;
}

您必须使用cudaMalloc3DArray和cudaArrayLayered标志设置来分配分层纹理的内存。在工具包示例中有一个完整的分层纹理使用示例，您可以研究它以了解它们是如何工作的。

不幸的是，CUDA SDK的示例只演示了2D分层纹理的用法。对于1D分层纹理，还有一些更棘手的细节。创建extentDesc时，必须把make_cudaExtent的第二个参数设为0，如下所示：

cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers);  // allocation extent: height must be 0 for a 1D layered array

但是，在用make_cudaExtent为cudaMemcpy3D设置mParams.extent时，第二个参数仍然需要传入1：

mParams.extent = make_cudaExtent(len, 1, numlayers);  // copy extent: height must be non-zero (1), otherwise the memcpy copies nothing

此外，还有一些其他不明显的细节，例如make_cudaPitchedPtr的pitch（行间距）参数。因此我在下面给出了完整且可运行的1D分层纹理代码。我在任何地方都找不到这样的例子，希望它能帮助到和你处境相同的人：

// To Compile: nvcc layeredTexture1D.cu -o layeredTexture1D.out
// To Run: ./layeredTexture1D.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
// pi, obtained by calling acos(-1) wherever the macro is expanded
#define pi acos(-1)
// 1D layered float texture reference; the kernel below samples it with tex1DLayered
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel for a 2D launch: the x dimension indexes sample locations,
// the y dimension indexes texture layers (comparable to 1D interpolation in MATLAB,
// applied to every layer).
//   d_output     - receives numlocations values per layer, stored layer after layer
//   d_locations  - positions to sample, shared by all layers
//   numlocations - number of entries in d_locations
//   numlayers    - number of layers to sample
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int numlayers) {
    const unsigned int col = threadIdx.x + blockDim.x * blockIdx.x;
    const unsigned int lyr = threadIdx.y + blockDim.y * blockIdx.y;
    // Skip threads that fall outside the (locations x layers) range
    if (col >= numlocations || lyr >= numlayers) return;
    // Offset the requested location by 0.5 before fetching from the texture
    const float texCoord = 0.5f + (float)d_locations[col];
    // Results for layer `lyr` occupy the lyr-th run of numlocations floats
    const unsigned int outIdx = lyr * numlocations + col;
    d_output[outIdx] = tex1DLayered(texRef, texCoord, lyr);
}
// Host code
// Abort with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Builds a small 1D layered texture (numlayers layers of len samples each),
// resamples every layer at a finer spacing on the GPU, and prints the input and
// the interpolated output. Returns 0 on success; exits with EXIT_FAILURE on the
// first CUDA error.
int main()
{
    // Setup h_data and locations to interpolate from
    const unsigned int len = 7;
    const unsigned int numlayers = 3;
    const unsigned int upsamp = 4;
    // upsamp output samples per input interval, plus the final end point
    const unsigned int loclen = 1 + (len - 1) * upsamp;
    float idx_spacing = 1 / (float)upsamp;
    // h_data is stored layer by layer: layer j occupies h_data[len*j .. len*j + len - 1]
    float h_data[numlayers*len], h_loc[loclen];
    for (int i = 0; i < len; i++)
        for (int j = 0; j < numlayers; j++)
            h_data[len*j + i] = 1 + cosf((float)pi*i / (j + 1.0f));
    for (int i = 0; i < loclen; i++)
        h_loc[i] = i*idx_spacing;
    // Copy the locations to interpolate at into device memory
    float* d_loc;
    CUDA_CHECK(cudaMalloc(&d_loc, loclen * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice));
    // Allocate CUDA array in device memory
    cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers);  // <-- 0 height required for 1Dlayered
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    CUDA_CHECK(cudaMalloc3DArray(&cuArray, &channelDesc, extentDesc, cudaArrayLayered));
    // Describe the host -> array copy: one row of len floats per layer
    cudaMemcpy3DParms mParams = { 0 };
    mParams.srcPtr = make_cudaPitchedPtr(h_data, len*sizeof(float), len, 1);
    mParams.dstArray = cuArray;
    mParams.kind = cudaMemcpyHostToDevice;
    mParams.extent = make_cudaExtent(len, 1, numlayers);  // <<-- non zero height required for memcpy to do anything
    CUDA_CHECK(cudaMemcpy3D(&mParams));
    // Set texture reference parameters
    texRef.addressMode[0] = cudaAddressModeBorder;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = false;
    // Bind the array to the texture reference
    CUDA_CHECK(cudaBindTextureToArray(texRef, cuArray, channelDesc));
    // Allocate result of transformation in device memory
    float *d_output;
    CUDA_CHECK(cudaMalloc(&d_output, loclen * numlayers * sizeof(float)));
    float h_output[loclen * numlayers];
    // Invoke kernel: x covers the locations, y covers the layers (ceil-div grid)
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid((loclen + dimBlock.x - 1) / dimBlock.x,
        (numlayers + dimBlock.y - 1) / dimBlock.y, 1);
    interp1Kernel<<<dimGrid, dimBlock>>>(d_output, d_loc, loclen, numlayers);
    // A kernel launch returns no status; pick up launch-configuration errors here
    CUDA_CHECK(cudaGetLastError());
    // Print Results
    printf("\n Original Indices \n");
    for (int i = 0; i < len; i++) printf("    %d ", i);
    printf("\n Original array \n");
    for (int j = 0; j < numlayers; j++) {
        for (int i = 0; i < len; i++) {
            printf("%5.3f ", h_data[i + j*len]);
        }
        printf("\n");
    }
    printf("\n Output Indices \n");
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n Output Array \n");
    // Blocking copy: also reports any error raised while the kernel was running
    CUDA_CHECK(cudaMemcpy(h_output, d_output, loclen * numlayers * sizeof(float), cudaMemcpyDeviceToHost));
    for (int j = 0; j < numlayers; j++) {
        for (int i = 0; i < loclen; i++) {
            printf("%5.3f ", h_output[i + j*loclen]);
        }
        printf("\n");
    }
    printf("\n");
    // Release the texture binding before freeing the array it refers to
    CUDA_CHECK(cudaUnbindTexture(texRef));
    // Free device memory
    CUDA_CHECK(cudaFreeArray(cuArray));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFree(d_loc));
    return 0;
}

相关内容

最新更新

热门标签：