我是CUDA新手。我已经知道如何在CUDA中制作1D和2D纹理。然而,我正在努力如何使用1D分层纹理。我的内核使用纹理的输出是全零,这绝对是不正确的。然而,我不确定我做错了什么。我很怀疑我是否正确设置了这个纹理,但我检查了cuda错误,没有发现任何问题。有人能告诉我如何正确地设置和使用1D分层纹理吗?这是我的代码。提前感谢:
// To Compile: nvcc backproj.cu -o backproj.out
// To Run: ./backproj.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int layer) {
unsigned int location_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (location_idx < numlocations) {
// Get the location you want to interpolate from the array
float loc2find = (float) d_locations[location_idx] + 0.5f;
// Read from texture and write to global memory
d_output[location_idx] = tex1DLayered(texRef, loc2find, layer);
}
}
// Host code
int main()
{
// Setup h_data and locations to interpolate from
const unsigned int len = 10;
const unsigned int numlayers = 3;
const unsigned int upsamp = 3;
const unsigned int loclen = 1 + (len - 1) * upsamp;
float idx_spacing = 1/(float)upsamp;
float h_data[len][numlayers], h_loc[loclen];
for (int i = 0; i < len; i++)
for (int j = 0; j < numlayers; j++)
h_data[i][j] = 1+cosf((float) pi*i/(j+1.0f));
for (int i = 0; i < loclen; i ++)
h_loc[i] = i*idx_spacing;
// Get the memory locations you want
float* d_loc;
cudaMalloc(&d_loc, loclen * sizeof(float));
cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
// Allocate CUDA array in device memory
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cuArray;
cudaMallocArray(&cuArray, &channelDesc, len, numlayers);
// Copy to device memory some data located at address h_data in host memory
cudaMemcpyToArray(cuArray, 0, 0, h_data, len * numlayers * sizeof(float), cudaMemcpyHostToDevice);
// Set texture reference parameters
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModeLinear;
texRef.normalized = false;
// Bind the array to the texture reference
cudaBindTextureToArray(texRef, cuArray, channelDesc);
// Allocate result of transformation in device memory
float* d_output;
cudaMalloc(&d_output, loclen * sizeof(float));
// Invoke kernel
int thdsPerBlk = 256;
int blksPerGrid = (int) (loclen / thdsPerBlk) + 1;
printf("Threads Per Block: %d, Blocks Per Grid: %dn", thdsPerBlk, blksPerGrid);
interp1Kernel <<<blksPerGrid, thdsPerBlk >>>(d_output, d_loc, loclen, 0);
// Print Results
printf("n Original Indices n");
for (int i = 0; i < len; i++) printf(" %d ", i);
printf("n Original array n");
for (int i = 0; i < len; i++) printf("%5.3f ", h_data[i][0]);
printf("n Output Indices n");
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("n Output Array n");
cudaMemcpy(h_loc, d_output, loclen * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("n");
// Free device memory
cudaFreeArray(cuArray);
cudaFree(d_output);
return 0;
}
您必须使用cudaMalloc3DArray
和cudaArrayLayered
标志设置来分配分层纹理的内存。在工具包示例中有一个完整的分层纹理使用示例,您可以研究它以了解它们是如何工作的。
不幸的是,CUDA SDK只告诉你如何在你有2D分层纹理时做到这一点。当涉及到1D分层纹理时,还有一些更棘手的问题。当生成extentDesc
时,您必须在make_cudaExtent
的第二个参数中输入0,如下所示:
cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
但是,当将make_cudaExtent
用于mParams.extent
用于cudaMemcpy3D
时,您仍然需要为第二个参数添加1:
mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
此外,还有一些其他不明显的细节,如make_cudaPitchedPtr
的音调。所以我已经包含了我的完整和功能代码的1D分层纹理。我在任何地方都找不到这样的例子。所以希望这能帮助到其他和你有同样处境的人:
// To Compile: nvcc layeredTexture1D.cu -o layeredTexture1D.out
// To Run: ./layeredTexture1D.out
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define pi acos(-1)
// 1D float textures: x is for input values, y is for corresponding output values
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;
// 1D interpolation kernel: Should be very similar to what you get if you used 1D interpolation on MATLAB
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int numlayers) {
unsigned int location_idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int layer = blockIdx.y * blockDim.y + threadIdx.y;
if (location_idx < numlocations && layer < numlayers) {
// Get the location you want to interpolate from the array
float loc2find = (float)d_locations[location_idx] + 0.5f;
// Read from texture and write to global memory
d_output[location_idx + layer*numlocations] = tex1DLayered(texRef, loc2find, layer);
//printf("location=%d layer=%d loc2find=%f result=%f n", location_idx, layer, loc2find, d_output[location_idx]);
}
}
// Host code
int main()
{
// Setup h_data and locations to interpolate from
const unsigned int len = 7;
const unsigned int numlayers = 3;
const unsigned int upsamp = 4;
const unsigned int loclen = 1 + (len - 1) * upsamp;
float idx_spacing = 1 / (float)upsamp;
float h_data[numlayers*len], h_loc[loclen];
for (int i = 0; i < len; i++)
for (int j = 0; j < numlayers; j++)
h_data[len*j + i] = 1 + cosf((float)pi*i / (j + 1.0f));
for (int i = 0; i < loclen; i++)
h_loc[i] = i*idx_spacing;
// Get the memory locations you want
float* d_loc;
cudaMalloc(&d_loc, loclen * sizeof(float));
cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);
// Allocate CUDA array in device memory
cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers); // <-- 0 height required for 1Dlayered
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaMemcpy3DParms mParams = { 0 };
mParams.srcPtr = make_cudaPitchedPtr(h_data, len*sizeof(float), len, 1);
mParams.kind = cudaMemcpyHostToDevice;
mParams.extent = make_cudaExtent(len, 1, numlayers); // <<-- non zero height required for memcpy to do anything
cudaArray* cuArray;
cudaMalloc3DArray(&cuArray, &channelDesc, extentDesc, cudaArrayLayered);
mParams.dstArray = cuArray;
cudaMemcpy3D(&mParams);
// Set texture reference parameters
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModeLinear;
texRef.normalized = false;
// Bind the array to the texture reference
cudaBindTextureToArray(texRef, cuArray, channelDesc);
// Allocate result of transformation in device memory
float *d_output;
cudaMalloc(&d_output, loclen * numlayers * sizeof(float));
float h_output[loclen * numlayers];
// Invoke kernel
dim3 dimBlock(16, 16, 1);
dim3 dimGrid((loclen + dimBlock.x - 1) / dimBlock.x,
(numlayers + dimBlock.y - 1) / dimBlock.y, 1);
interp1Kernel<<<dimGrid, dimBlock>>>(d_output, d_loc, loclen, numlayers);
// Print Results
printf("n Original Indices n");
for (int i = 0; i < len; i++) printf(" %d ", i);
printf("n Original array n");
for (int j = 0; j < numlayers; j++) {
for (int i = 0; i < len; i++) {
printf("%5.3f ", h_data[i + j*len]);
}
printf("n");
}
printf("n Output Indices n");
for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
printf("n Output Array n");
cudaMemcpy(h_output, d_output, loclen * numlayers * sizeof(float), cudaMemcpyDeviceToHost);
for (int j = 0; j < numlayers; j++) {
for (int i = 0; i < loclen; i++) {
printf("%5.3f ", h_output[i + j*loclen]);
}
printf("n");
}
printf("n");
// Free device memory
cudaFreeArray(cuArray);
cudaFree(d_output);
return 0;
}