Cudafy 内核无法编译



使用 Cudafy 迈出我的第一步,并尝试编写一个函数,该函数将获取其线程的位置并在此基础上将一些 int 值保存到数组元素中。我的代码:

/// <summary>
/// Kernel from the question: reads the thread/block/grid indices and sizes from
/// <paramref name="thread"/> into locals. The intended write of 255 into
/// <paramref name="results"/> is currently commented out, so as written the
/// kernel has no observable effect.
/// </summary>
/// <param name="thread">Thread context supplying threadIdx, blockIdx, gridDim and blockDim.</param>
/// <param name="results">Output array; not written while the last statement stays commented out.</param>
[Cudafy]
public static void GenerateRipples(GThread thread, int[] results)
{
  // Position of this thread inside its block.
  int threadPosInBlockX = thread.threadIdx.x;
  int threadPosInBlockY = thread.threadIdx.y;
  // Position of this block inside the grid.
  int blockPosInGridX = thread.blockIdx.x;
  int blockPosInGridY = thread.blockIdx.y;
  // Grid size (in blocks) and block size (in threads).
  int gridSizeX = thread.gridDim.x;
  int gridSizeY = thread.gridDim.y;
  int blockSizeX = thread.blockDim.x;
  int blockSizeY = thread.blockDim.y;
  // Intended global X coordinate of the thread (disabled):
  //int threadX = blockSizeX*blockPosInGridX + threadPosInBlockX;
  // If only one variable is used, everything is fine:
  int threadY = blockSizeY;
  // Reported problem: as soon as anything is added or multiplied, translation fails:
  //int threadY = blockSizeY*blockPosInGridY + threadPosInBlockY;

  // Intended write into the flattened output array (disabled):
//  results[gridSizeX*blockSizeX*threadY + threadX] = 255;
}

所以我无法计算这里的threadY。如果我在计算中使用多个变量,Cudafy 翻译类会抛出错误(CudafyModule cm = CudafyTranslator.Cudafy();抛出 Cudafy.CudafyLanguageException)。

我做错了什么?

更新:这是在 GPU 上运行内核的代码:

/// <summary>
/// Translates and loads the Cudafy module, launches <c>GenerateRipples</c> on a
/// 5x5 grid of 4x4-thread blocks over a 20*20 element device buffer, copies the
/// buffer back to the host and frees all device memory.
/// </summary>
public void RunTest2()
{
    // Acquire the device and upload the translated kernels.
    var device = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    var module = CudafyTranslator.Cudafy();
    device.LoadModule(module);

    // One int per pixel of a 20 x 20 image, on host and on device.
    const int pixelCount = 20 * 20;
    var hostPixels = new int[pixelCount];
    int[] devicePixels = device.Allocate<int>(pixelCount);

    // 5*5 blocks of 4*4 threads: one thread per pixel.
    var grid = new dim3(5, 5);
    var block = new dim3(4, 4);
    device.Launch(grid, block).GenerateRipples(devicePixels);

    // Fetch the results and release device memory.
    device.CopyFromDevice(devicePixels, hostPixels);
    device.FreeAll();
}

我们需要看看你是如何启动内核的,上面的代码应该运行得很好。我创建了一个运行良好的测试类,并为您提供了如何准备内核网格/块/线程维度的示例。如果您想查看出色的示例,请下载Cudafy源代码并编译CudafyExample项目,请查看他们如何准备和使用CUDAfy的功能。

**注意:**在我发布第一版的类之前,我一定是抽了什么好东西——我忘了验证它不会产生内存访问违规!

修复了下面的类,没有违规。

在Codeproject和StackOverflow上查找很棒的示例。

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using Cudafy;
using Cudafy.Host;
using Cudafy.Translator;
namespace FxKernelTest
{
    /// <summary>
    /// Minimal CUDAfy example: launches a 2D grid of 2D blocks and has every
    /// thread write its flattened global index into a 1D result array.
    /// </summary>
    public class FxKernTest
    {
        /// <summary>Not used by this example; kept so existing callers still compile.</summary>
        public GPGPU fxgpu;

        /// <summary>Number of result elements (and threads launched): 65536.</summary>
        public const int N = 1024 * 64;

        /// <summary>
        /// Translates and loads the kernel, runs it over <see cref="N"/> elements
        /// and verifies that element <c>i</c> of the result equals <c>i</c>.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown when any result element does not match its index.
        /// </exception>
        public void ExeTestKernel()
        {
            GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, 0);
            eArchitecture arch = gpu.GetArchitecture();
            CudafyModule km = CudafyTranslator.Cudafy(arch);
            gpu.LoadModule(km);

            int[] host_results = new int[N];

            // Either assign a new block of memory to hold results on device
            int[] dev_results = gpu.Allocate<int>(N);
            try
            {
                gpu.Set<int>(dev_results);

                // Or fill your array with values first and then
                for (int i = 0; i < N; i++)
                {
                    host_results[i] = i * 3;
                }
                // Copy array with ints to device
                //var dev_filled_results = gpu.CopyToDevice(host_results);

                // 64*16 = 1024 threads per block (which is max for sm_30)
                dim3 threadsPerBlock = new dim3(64, 16);
                // 8*8 = 64 blocks per grid, 1024 threads per block = 65536 threads in total
                dim3 blocksPerGrid = new dim3(8, 8);
                //var threadsPerBlock = 1024; // this will only give you blockDim.x = 1024, .y = 1, .z = 1
                //var blocksPerGrid = 1;      // just for show

                gpu.Launch(blocksPerGrid, threadsPerBlock, "GenerateRipples", dev_results);
                gpu.CopyFromDevice(dev_results, host_results);
            }
            finally
            {
                // Release only the buffer allocated here, even if launch or copy failed.
                gpu.Free(dev_results);
            }

            // Test our results
            for (int index = 0; index < N; index++)
            {
                if (host_results[index] != index)
                {
                    throw new InvalidOperationException("Check your indexing math, genius!!!");
                }
            }
        }

        /// <summary>
        /// Kernel: computes this thread's flattened global index from its
        /// block/thread coordinates and stores that index in <paramref name="results"/>.
        /// </summary>
        /// <param name="thread">Thread context supplying threadIdx, blockIdx, gridDim and blockDim.</param>
        /// <param name="results">Device array of at least <see cref="N"/> elements; element <c>tid</c> receives <c>tid</c>.</param>
        [Cudafy]
        public static void GenerateRipples(GThread thread, int[] results)
        {
            var blockSize = thread.blockDim.x * thread.blockDim.y;
            var offsetToGridY = blockSize * thread.gridDim.x;
            // Flattening 4 dimensions (grid Y/X, block Y/X) into a 1D array index.
            // Sizes in the comments below assume the 8x8 grid of 64x16 blocks used by ExeTestKernel.
            var tid = thread.blockIdx.y * offsetToGridY +       // each Grid Y is 8192 in size
                      thread.blockIdx.x * blockSize +           // each Grid X is 1024 in size
                      thread.threadIdx.y * thread.blockDim.x +  // each Block Y is 64 in size
                      thread.threadIdx.x;                       // index into block

            // The locals below reproduce the expressions from the question. They are
            // intentionally kept (although unused) to show that they translate fine.
            var threadPosInBlockX = thread.threadIdx.x;
            var threadPosInBlockY = thread.threadIdx.y;
            var blockPosInGridX = thread.blockIdx.x;
            var blockPosInGridY = thread.blockIdx.y;
            var gridSizeX = thread.gridDim.x;
            var gridSizeY = thread.gridDim.y;
            var blockSizeX = thread.blockDim.x;
            var blockSizeY = thread.blockDim.y;
            // this is your code, see how I calculate the actual thread ID above!
            var threadX = blockSizeX * blockPosInGridX + threadPosInBlockX;
            //if i use only one variable, everything is fine:
            var threadY = blockSizeY;
            // this calculates just fine
            threadY = blockSizeY * blockPosInGridY + threadPosInBlockY;

            // hint: use NSight for Visual Studio and look at the NSight output,
            // it reports access violations and tells you where...
            // Only write when the thread id is within the bounds of the array;
            // writing outside would cause an access violation.
            // (class constants are automatically passed to kernels)
            if (tid < N)
            {
                results[tid] = tid;
            }
        }
    }
}

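ptxas info : 0 bytes gmem
ptxas info : Compiling entry function 'GenerateRipples' for 'sm_30'
ptxas info : Function properties for GenerateRipples
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 5 registers, 328 bytes cmem[0]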

最新更新