Segmentation fault when executing a CUDA program



I am new to NVIDIA CUDA programming, and I am getting a "Segmentation fault" when running a program that uses the CUBLAS library. I have installed the NVIDIA CUDA Toolkit 6.5.

Here is my code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_cuda.h>
int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    int n2 = 5;
    float *h_T;
    cublasHandle_t handle;
    int dev = findCudaDevice(argc, (const char **)argv);
    if (dev == -1)
    {
        return EXIT_FAILURE;
    }
    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..n");
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization errorn");
        return EXIT_FAILURE;
    }
    printf("Allocating An");
    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));
    if (h_A == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (A)n");
        return EXIT_FAILURE;
    }
    printf("Allocated An");
    h_B = (float *)malloc(n2 * sizeof(h_B[0]));
    if (h_B == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (B)n");
        return EXIT_FAILURE;
    }
    printf("Allocated Bn");
    h_C = (float *)malloc(n2 * sizeof(h_C[0]));
    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)n");
        return EXIT_FAILURE;
    }
    printf("Allocated Cn");

    h_T = (float *)malloc(n2 * sizeof(h_T[0]));
    if (h_T == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    int i;
    for (i = 0; i < n2; i++)
    {
        h_A[i] = i;
        h_B[i] = i;
        //h_A[i] = rand() / (float)RAND_MAX;
        //h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = 0;
    }
    printf("Filled A,, B, Cn");
    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)n");
        return EXIT_FAILURE;
    }
    printf("Allocated d_An");
    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)n");
        return EXIT_FAILURE;
    }
    printf("Allocated d_Bn");
    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)n");
        return EXIT_FAILURE;
    }
    printf("Allocated d_Cn");
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)n");
        return EXIT_FAILURE;
    }
    fprintf(stderr, "!!!! error testn");
    printf("Vectors set.n");
    status = cublasGetVector(n2, sizeof(h_T[0]), d_A, 1, h_T, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read T)n");
        return EXIT_FAILURE;
    }
    int f;
    for (f = 0; f < n2; f++)
    {
        printf("T[%d]=%fn", f, h_T[f]);
    }

    status = cublasSdot(handle, n2, d_A, 1, d_B, 1, d_C);
    printf("Dot product done.n");
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error.n");
        return EXIT_FAILURE;
    }

    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)n");
        return EXIT_FAILURE;
    }
    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)n");
        return EXIT_FAILURE;
    }
    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)n");
        return EXIT_FAILURE;
    }
    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)n");
        return EXIT_FAILURE;
    }
    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)n");
        return EXIT_FAILURE;
    }

    return 0;
}

The program runs fine right up to the call to cublasSdot; I was able to narrow this down by adding the print statements shown above.

Output of my program:

GPU Device 0: "GRID K520" with compute capability 3.0
simpleCUBLAS test running..
Allocating A
Allocated A
Allocated B
Allocated C
Filled A,, B, C
Allocated d_A
Allocated d_B
Allocated d_C
!!!! error test
Vectors set.
T[0]=0.000000
T[1]=1.000000
T[2]=2.000000
T[3]=3.000000
T[4]=4.000000
Segmentation fault

I think I may be calling cublasSdot() in the wrong way. Please tell me where I am going wrong.

Note: I created the program above by following the CUDA Toolkit 6.5 sample file named "simpleCUBLAS.cpp". I print the values of the array h_T only to test cublasGetVector. cublasSdot computes the dot product of two vectors [more info].
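
For reference, the cuBLAS documentation declares the single-precision dot product roughly as follows (quoted from the library reference, not from my code):

cublasStatus_t cublasSdot(cublasHandle_t handle, int n,
                          const float *x, int incx,
                          const float *y, int incy,
                          float *result);

So the last argument is a pointer that receives the scalar result.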

I was able to get rid of the segmentation fault by adding the following line right after cublasCreate():

cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);

This tells the cuBLAS library that scalar results (such as the last argument of cublasSdot) are passed as device pointers, so the dot product is written to d_C in device memory instead of being dereferenced as a host address. Source: Retaining dot product on GPGPU using CUBLAS routine
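
For completeness, here is a minimal, self-contained sketch of the corrected call sequence with the pointer mode set to CUBLAS_POINTER_MODE_DEVICE (the names d_x, d_y, d_result, h_result are illustrative only, and most error checking is omitted for brevity):

/* Dot product with the scalar result kept in device memory. */
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
    const int n = 5;
    float h_x[] = {0, 1, 2, 3, 4};
    float h_y[] = {0, 1, 2, 3, 4};
    float *d_x, *d_y, *d_result;
    float h_result = 0.0f;

    cudaMalloc((void **)&d_x, n * sizeof(float));
    cudaMalloc((void **)&d_y, n * sizeof(float));
    cudaMalloc((void **)&d_result, sizeof(float));
    cudaMemcpy(d_x, h_x, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, n * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    /* Tell cuBLAS that scalar result pointers refer to device memory. */
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);

    /* The result is written to d_result on the device,
       not dereferenced as a host address. */
    cublasSdot(handle, n, d_x, 1, d_y, 1, d_result);

    /* Copy the scalar back to the host to print it (expected value: 30). */
    cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost);
    printf("dot = %f\n", h_result);

    cublasDestroy(handle);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_result);
    return 0;
}

The alternative is to leave the handle in the default CUBLAS_POINTER_MODE_HOST and pass the address of an ordinary host float as the last argument of cublasSdot; in that mode the result is written straight to host memory and no extra cudaMemcpy is needed.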
