CUDA_ERROR_ILLEGAL_ADDRESS访问 CUDA 内核中的变量时



在尝试运行用于计算佛陀分形轨道的内核时,我遇到了CUDA_ERROR_ILLEGAL_ADDRESS异常。

extern "C"
__global__ void exec(int iterations, int size,
                float* inputR,  float* inputI, // Real/Imaginary input
                int* output                    // Output image in one dimension
                ) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    float cR = inputR[i];
    float cI = inputI[i];
    float x = 0;
    float y = 0;
    float outX[1000];
    float outY[1000];
    for (int j = 0; j < iterations; j++) {
        outX[j] = x;
        outY[j] = y;
        float xNew = (x * x) - (y * y) + cR;
        float yNew = (2 * x * y) + cI;
        if (xNew * xNew + yNew * yNew > 4) {
            for (int k = 1; k < j; k++) {
                int curX = (outX[k] + 2 ) * size / 4;
                int curY = (outY[k] + 2 ) * size / 4;
                int idx = curX + size * curY;
                output[idx]++; // <- exception here
            }
            return;
        }
        x = xNew;
        y = yNew;
    }
}

我现在已经尝试了多种方法,但错误甚至似乎不是源于与我最初认为的相反的数组。例如

output[0] = 0;

会正常工作。但是,当我尝试调试idx时(请记住,我最初认为错误与数组有关),我发现我既不能像这样分配idx

output[0] = idx;

也不在 printf 语句中使用它

if (i == 0) {
    printf("%dn", idx);
}

我已经对curXcurY尝试了相同的方法,它们也拒绝工作,但是例如cR可以正常工作而不会出现任何错误。在最里面的循环中分配的变量似乎有问题(我也无法分配k),所以我尝试在函数开始时在所有循环之外声明idx,但无济于事。还是同样的错误。

堆栈跟踪:

Exception in thread "main" jcuda.CudaException: CUDA_ERROR_ILLEGAL_ADDRESS
        at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:330)
        at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1938)
        at fractal.Buddhabrot.<init>(Buddhabrot.java:96)
        at controller.Controller.<init>(Controller.java:10)
        at Main.main(Main.java:8)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:497)
        at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)

常数:

block size            512*1*1
grid size             64 *1*1
iterations            1000
size                  256
inputR, inputI length 64*512
output length         256*256

MCVE:

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.*;
import java.io.File;
import java.util.Random;
import static jcuda.driver.JCudaDriver.*;
public class Stackoverflow {
    public static final int SIZE = 256;
    public static final long NUM_POINTS = 128 * 128 * 128;
    public static final int ITERATIONS = 10000;
    public static final int BLOCK_SIZE = 512;
    public static final int SIM_THREADS = BLOCK_SIZE * 64;
    public static final Random random = new Random();
    public static void main(String[] args) {
        File ptxFile = new File("Buddha.ptx");
        setExceptionsEnabled(true);
        cuInit(0);
        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);
        CUmodule module = new CUmodule();
        cuModuleLoad(module, ptxFile.getAbsolutePath());
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "exec");
        cuCtxSetLimit(CUlimit.CU_LIMIT_PRINTF_FIFO_SIZE, 4096);
        float[] inR = new float[SIM_THREADS];
        float[] inI = new float[SIM_THREADS];
        int[] out = new int[SIZE * SIZE];
        CUdeviceptr deviceInputR = new CUdeviceptr();
        cuMemAlloc(deviceInputR, inR.length * Sizeof.FLOAT);
        CUdeviceptr deviceInputI = new CUdeviceptr();
        cuMemAlloc(deviceInputI, inI.length * Sizeof.FLOAT);
        CUdeviceptr deviceOutput = new CUdeviceptr();
        cuMemAlloc(deviceOutput, out.length * Sizeof.INT);
        for (long i = 0; i < NUM_POINTS; i += SIM_THREADS) {
            for (int j = 0; j < SIM_THREADS; j++) {
                inR[j] = random.nextFloat() * 4f - 2f;
                inI[j] = random.nextFloat() * 4f - 2f;
            }
            System.out.println("GPU START");
            cuMemcpyHtoD(deviceInputR, Pointer.to(inR), inR.length * Sizeof.FLOAT);
            cuMemcpyHtoD(deviceInputI, Pointer.to(inI), inI.length * Sizeof.FLOAT);
            Pointer kernelParameters = Pointer.to(
                    Pointer.to(new int[]{ITERATIONS}),
                    Pointer.to(new int[]{SIZE}),
                    Pointer.to(deviceInputR),
                    Pointer.to(deviceInputI),
                    Pointer.to(deviceOutput)
            );
            int gridSize = (int) Math.ceil(((double) SIM_THREADS) / BLOCK_SIZE);
            cuLaunchKernel(function,
                    gridSize, 1, 1,
                    BLOCK_SIZE, 1, 1,
                    0, null,
                    kernelParameters, null
            );
            cuCtxSynchronize();
            System.out.println("GPU END");
        }
        cuMemcpyDtoH(Pointer.to(out), deviceOutput, out.length * Sizeof.INT);
    }
}

在您的"常量"部分中,您已经指出了这一点:

iterations            1000

但是在您的 Java 代码中(在您提供 MCVE 之后),您有这个:

public static final int ITERATIONS = 10000;

这显然会导致内核代码的这一部分中断:

float outX[1000];
float outY[1000];
for (int j = 0; j < iterations; j++) {
    outX[j] = x;
    outY[j] = y;

由于 10000 对于 iterations 是索引越界。(此循环的范围实际上是与数据相关的,但对于某些数据输入模式,循环将遍历超过 1000,如所写)。

当我更改此内容时:

public static final int ITERATIONS = 10000;

对此:

public static final int ITERATIONS = 1000;

您的代码为我正确运行:

$ cuda-memcheck java -cp ".:jcuda-0.7.5b.jar" so1
========= CUDA-MEMCHECK
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
========= ERROR SUMMARY: 0 errors
$

最新更新