我正在尝试在CUDA上进行简单的矩阵乘法。我知道可以把数组展平后传到设备上,但这里我使用的是cudaMallocPitch和cudaMemcpy2D来做乘法。执行下面的代码时,当我尝试把结果复制回主机时,遇到了"an illegal memory access was encountered"(遇到非法内存访问)错误。非常感谢任何关于我哪里出错的建议。谢谢!
weights - 第一个矩阵,维度:30x784
input - 第二个矩阵,维度:784x100
results_d - 设备(GPU)上的结果矩阵
result - 复制回主机的结果矩阵
#include <math.h>
#include <stdio.h>
#include <cstdio>
#include <cstdlib>
// Error-checking macro wrapping every CUDA runtime call in the question code.
// NOTE(review): the line-continuation backslash after "(ans)" appears to have
// been lost in formatting — as written, the brace line is a stray compound
// statement rather than part of the macro.
#define CUDA_SAFE_CALL(ans)
{ gpuAssert((ans), __FILE__, __LINE__); }
// Prints the CUDA error string with its source location; exits unless
// abort is false.
// NOTE(review): "%dn" is presumably a formatting-mangled "%d\n".
inline void gpuAssert(cudaError_t code, char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "CUDA_SAFE_CALL: %s %s %dn", cudaGetErrorString(code),
file, line);
if (abort) exit(code);
}
}
// Kernel from the question: intended to compute one element of the matrix
// product per thread. NOTE(review): this version has the bugs detailed in
// the answer below — an uninitialized accumulator combined with `=+` instead
// of `+=`; a loop bound that uses the pitch (a width in BYTES) as if it were
// the shared matrix dimension; a misplaced close paren in the result address
// computation (adds `col` as a byte offset, unlike the two input accesses);
// and no bounds check, even though the 32x32 launch does not match the
// 30x100 output.
__global__ void MatrixMulKernel(double *input, double *weights,
double *results_d, size_t in_pitch,
size_t w1_pitch, size_t result_pitch) {
int row = threadIdx.x;
int col = threadIdx.y;
double value;  // NOTE(review): never initialized to zero
double *result_matrix;
// NOTE(review): misplaced paren — `col` is added in bytes, not elements
result_matrix = ((double *)((char *)results_d + row * result_pitch + col));
// NOTE(review): threadIdx is a dim3 struct; passing it to "%d" is invalid
printf("%d", threadIdx);
// NOTE(review): in_pitch is a byte pitch, not the element count (784)
for (int i = 0; i < in_pitch; i++)
{
double *element1 = ((double *)((char *)input + row * in_pitch) + i);
double *element2 = ((double *)((char *)weights + i * w1_pitch) + col);
value = +(*element1) * (*element2);  // NOTE(review): `=+` should be `+=`
}
*result_matrix = value;
}
// Host code from the question; the answer below enumerates its problems.
int main() {
static double arr1[30][784];   // weights: 30 x 784
static double arr2[784][100];  // input:   784 x 100
static double result[30][100]; // result:  30 x 100
for (int i = 0; i < 30; i++)
{
for (int j = 0; j < 784; j++) arr1[i][j] = 5;
}
for (int i = 0; i < 784; i++) {
for (int j = 0; j < 100; j++) arr2[i][j] = 3;
}
double *input;
double *weights;
double *results_d;
size_t in_pitch, w1_pitch, result_pitch;
// allocating memory in GPU for 2 inputs and result
CUDA_SAFE_CALL(
cudaMallocPitch((void **)&input, &in_pitch, 100 * sizeof(double), 784));
CUDA_SAFE_CALL(
cudaMallocPitch((void **)&weights, &w1_pitch, 784 * sizeof(double), 30));
CUDA_SAFE_CALL(cudaMallocPitch((void **)&results_d, &result_pitch,
100 * sizeof(double), 30));
// Copy matrix from host to device
CUDA_SAFE_CALL(cudaMemcpy2D(input, in_pitch, arr2, 100 * sizeof(double),
100 * sizeof(double), 784,
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(weights, w1_pitch, arr1, 784 * sizeof(double),
784 * sizeof(double), 30,
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy2D(results_d, result_pitch, result,
100 * sizeof(double), 100 * sizeof(double), 30,
cudaMemcpyHostToDevice));
// using GPU
// NOTE(review): a single 32x32 block cannot cover the whole result matrix;
// the grid must be sized to span the full output area.
dim3 dimGrid(1, 1, 1);
dim3 dimBlock(32, 32, 1);
printf("before kernel fucntion");
MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights, results_d, in_pitch,
w1_pitch, result_pitch);
printf("after kernel fucntion");
cudaThreadSynchronize();
// copying back to host
// NOTE(review): pitch arguments swapped — destination (packed host array)
// pitch should be 100*sizeof(double); source (device) pitch result_pitch.
CUDA_SAFE_CALL(cudaMemcpy2D(result, result_pitch, results_d,
100 * sizeof(double), 100 * sizeof(double), 30,
cudaMemcpyDeviceToHost));
// printing and seeing whether the result matrix has been updated
for (int i = 0; i < 100; i++) {
for (int j = 0; j < 30; j++) {
// NOTE(review): prints the array's address, not an element (result[j][i]);
// and "n" below is presumably a formatting-mangled "\n"
printf("%f", result);
}
printf("n");
}
CUDA_SAFE_CALL(cudaFree(input));
CUDA_SAFE_CALL(cudaFree(weights));
CUDA_SAFE_CALL(cudaFree(results_d));
return 0;
}
此代码中有许多错误。首先,尚不清楚使用带间距的分配(pitched allocation)在这里会带来什么好处。其次,如果您真的想要快速的矩阵乘法性能,应该使用cuBLAS。
问题:
-
您似乎没有理解带间距的分配(pitched allocation)。返回的
pitch
值以 **字节** 为单位,不能把它直接当作矩阵乘法的循环索引来用。另外,pitch
值是整行分配的总宽度,与有效数据区域并不一致;循环上限应使用正确的矩阵维度。 -
您的代码没有对整个矩阵区域做乘法。您只创建了一个32x32线程的单个块,但需要足够多的块/线程来覆盖整个矩阵区域。这需要修改网格尺寸、把矩阵维度传入内核,并在内核中加上"线程检查"以防止越界访问。
-
这个用于带间距访问(pitched access)的构造是不正确的:
result_matrix = ((double*)((char*)results_d + row*result_pitch + col));
它与您对两个输入矩阵的构造不一致,右括号的位置放错了。
-
您把两个输入矩阵的含义弄反了。您对
input
矩阵的索引方式就好像它是weights
矩阵一样,反之亦然。我们需要交换row
、column
和i
的含义,使它们与实际的矩阵维度匹配。 -
您最后一次
cudaMemcpy2D
调用的两个 pitch 参数位置颠倒了:
cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost)
^^^^^ ^^^^^
您忘了把循环的累加变量初始化为零:
double value;
我不知道您在这里的意图,它应该是
+=
而不是=+
:value =+ ...
以下代码已经解决了这些问题,并且似乎对我而没有错误:
$ cat t104.cu
#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>
// Matrix dimensions: weights (arr1) is d1 x d2, input (arr2) is d2 x d3,
// so the product result is d1 x d3.
const int d1 = 30;
const int d2 = 784;
const int d3 = 100;
// Host-side matrices at file scope (too large for the stack).
double arr1[d1][d2];
double arr2[d2][d3];
double result[d1][d3];
// Wraps a CUDA runtime call and reports file/line on failure.
#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints the CUDA error string with its source location; exits with the
// error code unless abort is false.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
  if (code != cudaSuccess)
  {
    // "\n" restored: "%dn" in the original lost its backslash in formatting
    fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
  }
}
// Computes results_d = weights (rrow x dim) * input (dim x rcol), with all
// three matrices stored in pitched device buffers (pitches in bytes).
// Expects a 2D launch with one thread per output element; excess threads
// return immediately.
__global__ void MatrixMulKernel(double *input, double *weights, double *results_d, size_t in_pitch, size_t w1_pitch, size_t result_pitch, int dim, int rrow, int rcol)
{
  // Global output coordinates for this thread.
  int c = threadIdx.x + blockDim.x * blockIdx.x;
  int r = threadIdx.y + blockDim.y * blockIdx.y;
  // Thread check: skip threads outside the result area.
  if (r >= rrow || c >= rcol) return;

  // Dot product of weights row r with input column c.
  double acc = 0;
  for (int k = 0; k < dim; k++)
  {
    const double *in_row = (const double *)((const char *)input + k * in_pitch);
    const double *w_row  = (const double *)((const char *)weights + r * w1_pitch);
    acc += in_row[c] * w_row[k];
  }
  // Store into the pitched result row.
  double *out_row = (double *)((char *)results_d + r * result_pitch);
  out_row[c] = acc;
}
// Host driver: fills arr1 (d1 x d2) with 5s and arr2 (d2 x d3) with 3s,
// multiplies them on the GPU with pitched allocations, and prints the
// d1 x d3 result (every element should be 5 * 3 * d2).
int main()
{
  for (int i = 0; i < d1; i++)
  {
    for (int j = 0; j < d2; j++)
      arr1[i][j] = 5;
  }
  for (int i = 0; i < d2; i++)
  {
    for (int j = 0; j < d3; j++)
      arr2[i][j] = 3;
  }
  double *input;
  double *weights;
  double *results_d;
  size_t in_pitch, w1_pitch, result_pitch;
  // Pitched allocations: width is the row size in bytes, height the row count.
  CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,d3*sizeof(double),d2));
  CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,d2*sizeof(double),d1));
  CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,d3*sizeof(double),d1));
  // Host -> device copies; host arrays are tightly packed, so the source
  // pitch equals the row width in bytes.
  CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,d3*sizeof(double),d3*sizeof(double),d2,cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,d2*sizeof(double),d2*sizeof(double),d1,cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,d3*sizeof(double),d3*sizeof(double),d1,cudaMemcpyHostToDevice));
  // One thread per output element; round the grid up so the whole
  // d1 x d3 result area is covered.
  dim3 dimBlock(32,32,1);
  dim3 dimGrid(((d3+dimBlock.x-1)/dimBlock.x),((d1+dimBlock.y-1)/dimBlock.y),1);
  MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch, d2, d1, d3);
  // Surface launch-configuration errors and in-kernel faults here,
  // before the device-to-host copy reports them as its own failure.
  CUDA_SAFE_CALL(cudaGetLastError());
  CUDA_SAFE_CALL(cudaDeviceSynchronize());
  // Device -> host: destination pitch is the packed host row width,
  // source pitch is the pitched device row width.
  CUDA_SAFE_CALL(cudaMemcpy2D(result,d3*sizeof(double),results_d,result_pitch,d3*sizeof(double),d1,cudaMemcpyDeviceToHost));
  // Print the result, traversed column-by-column as in the original.
  for (int i = 0; i < d3; i++)
  {
    for (int j = 0; j < d1; j++)
    {
      printf("%f", result[j][i]);
    }
    printf("\n");  // "\n" restored: the backslash was lost in formatting
  }
  CUDA_SAFE_CALL(cudaFree(input));
  CUDA_SAFE_CALL(cudaFree(weights));
  CUDA_SAFE_CALL(cudaFree(results_d));
  return 0;
}
$ nvcc -arch=sm_61 -o t104 t104.cu
$