CUDA_SAFE_CALL:遇到了非法内存访问



我正在尝试在CUDA上实现一个简单的矩阵乘法。我知道可以把普通数组直接传到设备上,但这里我使用的是 cudaMallocPitch 和 cudaMemcpy2D。执行下面的代码、尝试把结果复制回主机时,我遇到了"an illegal memory access was encountered"(遇到非法内存访问)错误。对于我哪里出了错,任何建议都非常感谢。谢谢!

weights——第一个矩阵,维度:30x784

输入 - 第二矩阵,dim:784x100

results_d——设备(GPU)上的结果矩阵

result——复制回主机的结果矩阵

#include <math.h>
#include <stdio.h>
#include <cstdio>
#include <cstdlib>
// Error-checking wrapper: report file/line context and abort on any CUDA
// API failure.  NOTE(review): the backslash line-continuation after the
// #define and the "\n" escape in the format string were lost when this code
// was pasted ("%dn"); both are restored here.
#define CUDA_SAFE_CALL(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code),
            file, line);
    if (abort) exit(code);  // exit with the CUDA error code for scripting
  }
}
// Naive matrix-multiply kernel AS POSTED BY THE ASKER -- kept verbatim,
// because the answer below enumerates its defects.  Known problems:
//  1. the loop bound `in_pitch` is a BYTE count returned by cudaMallocPitch,
//     not the matrix inner dimension;
//  2. `result_matrix` adds `col` inside the (char*) byte arithmetic --
//     misplaced close parenthesis, so `col` is a byte offset, not an element;
//  3. row/col are applied to the wrong matrices (input indexed as if it were
//     weights and vice versa);
//  4. `value` is never zero-initialized, and `=+` should be `+=`;
//  5. passing the whole threadIdx struct to printf("%d", ...) is undefined
//     behavior.
__global__ void MatrixMulKernel(double *input, double *weights,
                                double *results_d, size_t in_pitch,
                                size_t w1_pitch, size_t result_pitch) {
  int row = threadIdx.x;
  int col = threadIdx.y;
  double value;  // BUG: uninitialized accumulator
  double *result_matrix;
  // BUG: `+ col` is inside the byte (char*) arithmetic
  result_matrix = ((double *)((char *)results_d + row * result_pitch + col));
  printf("%d", threadIdx);  // BUG: threadIdx is a struct, not an int
  for (int i = 0; i < in_pitch; i++)  // BUG: in_pitch is in bytes
  {
    double *element1 = ((double *)((char *)input + row * in_pitch) + i);
    double *element2 = ((double *)((char *)weights + i * w1_pitch) + col);
    value = +(*element1) * (*element2);  // BUG: `=+` overwrites, not `+=`
  }
  *result_matrix = value;
}
// Host driver AS POSTED BY THE ASKER -- kept verbatim; see the answer below
// for the corrected version.  Notable defects: a single 32x32 block cannot
// cover the 30x100 output (and no grid covers the full problem); the final
// cudaMemcpy2D swaps the destination/source pitch arguments; the result is
// printed by passing the 2-D array itself to printf("%f", ...) (undefined
// behavior); "n" in the printf strings is a garbled "\n"; and
// cudaThreadSynchronize() is deprecated in favor of cudaDeviceSynchronize().
int main() {
  // Host matrices: arr1 is 30x784 (weights), arr2 is 784x100 (input),
  // result is 30x100.  `static` keeps the large arrays off the stack.
  static double arr1[30][784];
  static double arr2[784][100];
  static double result[30][100];
  for (int i = 0; i < 30; i++)
  {
    for (int j = 0; j < 784; j++) arr1[i][j] = 5;
  }
  for (int i = 0; i < 784; i++) {
    for (int j = 0; j < 100; j++) arr2[i][j] = 3;
  }
  double *input;
  double *weights;
  double *results_d;
  size_t in_pitch, w1_pitch, result_pitch;
  // allocating memory in GPU for 2 inputs and result
  CUDA_SAFE_CALL(
      cudaMallocPitch((void **)&input, &in_pitch, 100 * sizeof(double), 784));
  CUDA_SAFE_CALL(
      cudaMallocPitch((void **)&weights, &w1_pitch, 784 * sizeof(double), 30));
  CUDA_SAFE_CALL(cudaMallocPitch((void **)&results_d, &result_pitch,
                                 100 * sizeof(double), 30));
  // Copy matrix from host to device
  CUDA_SAFE_CALL(cudaMemcpy2D(input, in_pitch, arr2, 100 * sizeof(double),
                              100 * sizeof(double), 784,
                              cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy2D(weights, w1_pitch, arr1, 784 * sizeof(double),
                              784 * sizeof(double), 30,
                              cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy2D(results_d, result_pitch, result,
                              100 * sizeof(double), 100 * sizeof(double), 30,
                              cudaMemcpyHostToDevice));
  // using GPU
  // BUG: one 32x32 block only -- the grid does not cover the whole output.
  dim3 dimGrid(1, 1, 1);
  dim3 dimBlock(32, 32, 1);
  printf("before kernel fucntion");
  MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights, results_d, in_pitch,
                                         w1_pitch, result_pitch);
  printf("after kernel fucntion");
  cudaThreadSynchronize();
  // copying back to host
  // BUG: dpitch/spitch are reversed -- result_pitch belongs to the device
  // (source) side here, and the host array's pitch is 100*sizeof(double).
  CUDA_SAFE_CALL(cudaMemcpy2D(result, result_pitch, results_d,
                              100 * sizeof(double), 100 * sizeof(double), 30,
                              cudaMemcpyDeviceToHost));
  // printing and seeing whether the result matrix has been updated
  for (int i = 0; i < 100; i++) {
    for (int j = 0; j < 30; j++) {
      printf("%f", result);  // BUG: prints the array pointer, not an element
    }
    printf("n");
  }
  CUDA_SAFE_CALL(cudaFree(input));
  CUDA_SAFE_CALL(cudaFree(weights));
  CUDA_SAFE_CALL(cudaFree(results_d));
  return 0;
}

此代码中有许多错误。首先,目前尚不清楚使用带间距(pitched)的分配在这里能带来什么好处。其次,如果您真的想要快速的矩阵乘法性能,应该使用 cuBLAS。

问题:

  1. 您似乎不理解 pitched(带间距)分配。cudaMallocPitch 返回的 pitch 值以 **字节** 为单位,不能直接当作矩阵乘法的循环下标使用。而且,pitch 是每个分配行的总宽度(字节数),并不对应有效数据区域的大小;循环边界应当使用实际的矩阵维度。

  2. 您的代码没有在整个矩阵区域上执行乘法。您只创建了一个 32x32 线程的单个 block,但需要足够多的 block/线程来覆盖整个输出矩阵。这需要修改网格尺寸、把矩阵维度传给内核,并在内核中加入"线程检查"(边界判断)以防止越界访问。

  3. 这个用于 pitched 访问的构造是不正确的:

    result_matrix = ((double*)((char*)results_d + row*result_pitch + col));
    

    它与您对两个输入矩阵使用的构造不一致:右括号的位置放错了,导致 `col` 被当作字节偏移而不是元素偏移。

  4. 您把两个输入矩阵的含义弄反了:您按 `weights` 的方式索引 `input` 矩阵,反之亦然。需要交换 `row`、`col` 和 `i` 的用法,使索引与实际的矩阵维度匹配。

  5. 您最后一次 cudaMemcpy2D 操作的两个 pitch 参数位置颠倒了(目标 pitch 与源 pitch 互换):

cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost)

                    ^^^^^                   ^^^^^
  6. 您忘了把循环累加变量初始化为零:

    double value;
    
  7. 不确定您这里的意图,但它应该是 `+=` 而不是 `=+`:

    value =+ ...
    

以下代码已经解决了这些问题,并且似乎对我而没有错误:

$ cat t104.cu
#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>
// Problem dimensions: weights (arr1) is d1 x d2, input (arr2) is d2 x d3,
// and the product result is d1 x d3.
const int d1 = 30;
const int d2 = 784;
const int d3 = 100;
// File-scope host matrices (too large for the stack).
double arr1[d1][d2];
double arr2[d2][d3];
double result[d1][d3];

// Error-checking wrapper: report file/line context and abort on any CUDA
// API failure.
#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        // "\n" restored -- the escape was garbled to "%dn" in the paste.
        fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Corrected matrix-multiply kernel: each thread computes one element of the
// rrow x rcol result (result = weights[rrow x dim] * input[dim x rcol]).
// The pitch arguments are in BYTES, so row bases are computed via (char*)
// arithmetic and the column index is added afterwards in element units.
// Expects a 2D launch whose grid covers at least rcol x rrow threads.
__global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch, int dim, int rrow, int rcol)
{
  int col = threadIdx.x + blockDim.x*blockIdx.x;
  int row=  threadIdx.y + blockDim.y*blockIdx.y;
  // Thread check: discard out-of-bounds threads from the rounded-up grid.
  if ((row >= rrow) || (col >= rcol)) return;
  double value = 0;  // accumulator, zero-initialized
  double *result_matrix;
  // Byte-offset to the row base, then element-offset to the column.
  result_matrix = ((double*)((char*)results_d + row*result_pitch) + col);
  for(int i =0 ; i < dim ; i++)
  {
    // input[i][col] and weights[row][i], each addressed via its own pitch.
    double *element1 = ((double*)((char*)input + i*in_pitch) + col) ;
    double *element2 =   ((double*)((char*)weights + row*w1_pitch) + i);
    value += (*element1) * (*element2);
  }
  *result_matrix = value;
}


// Host driver for the corrected pitched matrix multiply:
// allocates pitched device buffers, copies the operands up, launches a grid
// that fully covers the d1 x d3 output, and copies the result back.
int main()
{

  // Fill the operands with known constants so the product is easy to verify:
  // every result element should be 5 * 3 * d2.
  for (int i = 0 ; i < d1; i++)
  {
    for(int j =0;j <d2 ; j ++)
      arr1[i][j] = 5;
  }
  for (int i =0 ; i < d2; i ++)
  {
    for(int j=0;j < d3 ; j++)
      arr2[i][j] = 3;
  }

  double *input;
  double *weights;
  double *results_d;
  size_t in_pitch,w1_pitch,result_pitch;

//allocating memory in GPU for 2 inputs and result
  CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,d3*sizeof(double),d2));
  CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,d2*sizeof(double),d1));
  CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,d3*sizeof(double),d1));
//Copy matrix from host to device
// For each copy the host side is dense, so its pitch equals the row width.
  CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,d3*sizeof(double),d3*sizeof(double),d2,cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,d2*sizeof(double),d2*sizeof(double),d1,cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,d3*sizeof(double),d3*sizeof(double),d1,cudaMemcpyHostToDevice));

//using GPU

  dim3 dimBlock(32,32,1);
  // Ceiling-divide so the grid covers the whole d3 x d1 output.
  dim3 dimGrid(((d3+dimBlock.x-1)/dimBlock.x),((d1+dimBlock.y-1)/dimBlock.y),1);
  MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch, d2, d1, d3);
  // Catch launch-configuration errors; the blocking cudaMemcpy2D below
  // surfaces any asynchronous execution errors.
  CUDA_SAFE_CALL(cudaGetLastError());
//copying back to host
  CUDA_SAFE_CALL(cudaMemcpy2D(result,d3*sizeof(double),results_d,result_pitch,d3*sizeof(double),d1,cudaMemcpyDeviceToHost));

//printing and seeing whether the result matrix has been updated
  for (int i =0 ; i < d3; i ++)
  {
    for(int j=0;j < d1 ; j++)
    {
      printf("%f", result[j][i]);
    }
    printf("\n");  // "\n" restored -- garbled to "n" in the paste
  }
  CUDA_SAFE_CALL(cudaFree(input));
  CUDA_SAFE_CALL(cudaFree(weights));
  CUDA_SAFE_CALL(cudaFree(results_d));

  return 0;
}
$ nvcc -arch=sm_61 -o t104 t104.cu
$

最新更新