说明
我试图在 GPU 上使用 2D 网格和 2D 块执行矩阵求和,并在多次执行程序后得到不同的结果。对此行为的任何解释或修复都将有所帮助,谢谢。
细节
大多数情况下,CPU 上的结果与 GPU 上的结果一致。但有时(例如,在操作系统刚启动之后)程序会报告结果不一致,而之后的所有执行又都会产生一致的结果(并且运行速度似乎更快)。我还没有找到能够稳定重现这种行为的方法。我尝试过再次重新启动操作系统,但那一次程序的第一次执行产生了一致的结果。
代码
main 函数对两个 2^10 x 2^10 的矩阵分别在 CPU 和 GPU 上(使用 2^5 x 2^5 的网格和 2^5 x 2^5 的块)执行求和,并比较两者的结果。
#include "stdio.h"
#define FALSE 0
#define TRUE !FALSE
// Allocates uninitialized host storage for a row x column matrix of doubles.
//
// row, column: matrix dimensions (element counts).
// Returns a pointer obtained from malloc (release it with free), or NULL when
// a dimension is negative, the byte count does not fit in size_t, or malloc fails.
double *mallocMatrix(const int row, const int column)
{
    // A negative dimension would otherwise wrap to an enormous size_t.
    if (row < 0 || column < 0)
        return NULL;

    // Do the size arithmetic in size_t; multiplying two ints first can overflow
    // before the result is widened.
    const size_t maxElements = (size_t)-1 / sizeof(double);
    if (column != 0 && (size_t)row > maxElements / (size_t)column)
        return NULL;

    const size_t count = (size_t)row * (size_t)column;
    return (double*)malloc(count * sizeof(double));
}
// Presumably meant to fill a row x column matrix with initial values.
// NOTE(review): the body consists of a single empty statement, so nothing is
// written to `matrix` and none of the parameters are used.
void matrixInit(double *matrix, const int row, const int column)
{
;
}
// Compares two row x column matrices element by element (flat index
// i*column + j) using exact floating-point equality.
//
// mat1, mat2: matrices to compare, each holding at least row*column doubles.
// Returns TRUE when every entry is equal; otherwise prints the flat index of
// the first differing entry and returns FALSE.
int matEqual(double *mat1, double *mat2, const int row, const int column)
{
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < column; j++)
        {
            const int k = i * column + j;
            if (mat1[k] != mat2[k])
            {
                // Terminate the message with a real newline so that following
                // output starts on its own line.
                printf("Entry %d doens't match.\n", k);
                return FALSE;
            }
        }
    }
    return TRUE;
}
// Host reference implementation: writes the element-wise sum of m1 and m2
// into n. All three buffers are indexed as flat row x column arrays
// (element (r, c) lives at r*column + c).
void matrixSumCpu(double *m1, double *m2, double *n, const int row, const int column)
{
    for (int r = 0; r < row; ++r)
    {
        // Flat offset of the first element of row r.
        const int base = r * column;
        for (int c = 0; c < column; ++c)
        {
            const int at = base + c;
            n[at] = m1[at] + m2[at];
        }
    }
}
// Element-wise matrix sum kernel: n = m1 + m2.
//
// Each thread handles at most one element. The thread's global x index is
// used as the row and its global y index as the column; threads that fall
// outside the row x column range do nothing. Elements not covered by the
// launched grid are left unwritten.
__global__ void _2dGrid2dBlockMatSum(double *m1, double *m2, double *n, const int row, const int column)
{
    const int r = blockIdx.x * blockDim.x + threadIdx.x;
    const int c = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against threads beyond the matrix bounds.
    if (r >= row || c >= column)
        return;

    // Flattened position of element (r, c).
    const int flat = r * column + c;
    n[flat] = m1[flat] + m2[flat];
}
// Aborts the program when a CUDA runtime call reports a failure.
//
// code: status returned by a CUDA runtime API call.
// On any status other than cudaSuccess, prints the runtime's description of
// the error to stderr and terminates the process with exit code -1.
void checkGpuMalloc(cudaError_t code)
{
    if (code != cudaSuccess)
    {
        // Report before exiting: a message placed after exit() is never printed.
        fprintf(stderr, "CUDA ERROR occurred: %s\n", cudaGetErrorString(code));
        exit(-1);
    }
}
// Prints a preview of a row x column matrix: at most the first 3 rows and the
// first 6 columns, with "..." marking truncated rows/columns.
//
// mat: matrix stored as a flat array, element (i, j) at i*column + j.
// row, column: matrix dimensions.
void printMatrix(double *mat, const int row, const int column)
{
    const int rowToPrint = 3;
    const int columnToPrint = 6;
    // Clamp the preview to the real dimensions so small matrices are not read
    // past their end.
    const int rowsShown = row < rowToPrint ? row : rowToPrint;
    const int columnsShown = column < columnToPrint ? column : columnToPrint;

    for (int i = 0; i < rowsShown; i++)
    {
        for (int j = 0; j < columnsShown; j++)
            printf("%lf ", mat[i * column + j]); // trailing space keeps values apart
        if (column > columnToPrint)
            printf("...");
        printf("\n");
    }
    if (row > rowToPrint)
        printf("...\n");
}
// Entry point: sums two matrices once on the CPU and once on the GPU, then
// compares the two results and prints whether they match.
// NOTE(review): several string literals below end in a bare "n" where a "\n"
// escape looks intended.
int main()
{
// Both matrices are 2^10 x 2^10.
int row=1<<10, column=1<<10;
// Host buffers: h_m1/h_m2 are the inputs, h_n1 receives the CPU sum and
// h_n2 receives the GPU sum copied back from the device.
double *h_m1=NULL, *h_m2=NULL,*h_n1=NULL, *h_n2=NULL;//n=m1+m2
h_m1=mallocMatrix(row, column);
h_m2=mallocMatrix(row, column);
h_n1=mallocMatrix(row, column);
h_n2=mallocMatrix(row, column);
// Bail out if any host allocation failed.
if(h_m1==NULL||h_m2==NULL||h_n1==NULL||h_n2==NULL)
{
printf("Unable to allocate enough memory on CPUn");
exit(-1);
}
// NOTE(review): matrixInit in this listing has an empty body, so the inputs
// keep whatever contents malloc returned.
matrixInit(h_m1,row,column);
matrixInit(h_m2,row,column);
printf("Summing matrices on CPU...n");
// CPU reference result goes into h_n1.
matrixSumCpu(h_m1,h_m2,h_n1,row,column);
// Device buffers for the two inputs and the output.
double *d_m1=NULL, *d_m2=NULL, *d_n=NULL;
checkGpuMalloc(cudaMalloc((void**)&d_m1, row*column*sizeof(double)));
checkGpuMalloc(cudaMalloc((void**)&d_m2, row*column*sizeof(double)));
checkGpuMalloc(cudaMalloc((void**)&d_n, row*column*sizeof(double)));
// Upload the inputs. The return values of these copies are not checked.
// d_n is never cleared, so any element the kernel does not write keeps the
// previous contents of that device memory.
cudaMemcpy(d_m1, h_m1, row*column*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_m2, h_m2, row*column*sizeof(double), cudaMemcpyHostToDevice);
printf("Summing matrices on GPU with 2D grid and 2D blocks.n");
// NOTE(review): (1<<5,1<<5) is a parenthesised comma expression, which
// evaluates to its last operand, i.e. the scalar 32. Both launch arguments
// are therefore 32 (a 1D grid of 32 blocks with 32 threads each), not the
// 2D dim3(32,32) configuration the message above announces.
_2dGrid2dBlockMatSum<<<(1<<5,1<<5),(1<<5, 1<<5)>>>(d_m1, d_m2, d_n, row, column);
// Wait for the kernel to finish; the returned status is ignored.
cudaDeviceSynchronize();
// Download the GPU result into h_n2.
cudaMemcpy(h_n2, d_n, row*column*sizeof(double), cudaMemcpyDeviceToHost);
// Compare the CPU and GPU results; on mismatch print a preview of both.
if(matEqual(h_n1, h_n2, row, column))
printf("Matrices match.n");
else
{
printf("Matrices don't match.nResult on CPU:n");
printMatrix(h_n1, row, column);
printf("Result on GPU:");
printMatrix(h_n2, row, column);
}
// Release host and device memory, then reset the device.
free(h_m1);
free(h_m2);
free(h_n1);
free(h_n2);
cudaFree(d_m1);
cudaFree(d_m2);
cudaFree(d_n);
cudaDeviceReset();
return 0;
}
下面这一行的实际行为与您预期的不同;我编译您的代码时,编译器针对这一行给出了警告:
_2dGrid2dBlockMatSum<<<(1<<5,1<<5),(1<<5, 1<<5)>>>(d_m1, d_m2, d_n, row, column);
你应该做这样的事情:
_2dGrid2dBlockMatSum<<<dim3(1<<5,1<<5),dim3(1<<5, 1<<5)>>>(d_m1, d_m2, d_n, row, column);
这:
dim3(1<<5,1<<5)
与此不同:
(1<<5,1<<5)
C++ 编译器会把括号里的内容当作逗号表达式,只取最后一个子表达式的值,得到的并不是您期望的结果(标量 32,而不是二维的 (32,32))。
为什么您的 matrixInit 函数是空的?
如果要强制代码始终失败,请添加一些矩阵初始化:
// Sets every element of a row x column matrix (flat array, element (r, c) at
// r*column + c) to 1.
void matrixInit(double *matrix, const int row, const int column)
{
    for (int r = 0; r < row; ++r)
    {
        // Flat offset of the first element of row r.
        const int base = r * column;
        for (int c = 0; c < column; ++c)
        {
            matrix[base + c] = 1;
        }
    }
}
并在内核调用之前添加此行:
cudaMemset(d_n, 0, row*column*sizeof(double));
然后编译并运行它,它将失败。
之后,按照我的建议改用 dim3,问题就会得到修复。
下面是完整的示例代码:
#include "stdio.h"
#define FALSE 0
#define TRUE !FALSE
// Allocates uninitialized host storage for a row x column matrix of doubles.
//
// row, column: matrix dimensions (element counts).
// Returns a pointer obtained from malloc (release it with free), or NULL when
// a dimension is negative, the byte count does not fit in size_t, or malloc fails.
double *mallocMatrix(const int row, const int column)
{
    // A negative dimension would otherwise wrap to an enormous size_t.
    if (row < 0 || column < 0)
        return NULL;

    // Do the size arithmetic in size_t; multiplying two ints first can overflow
    // before the result is widened.
    const size_t maxElements = (size_t)-1 / sizeof(double);
    if (column != 0 && (size_t)row > maxElements / (size_t)column)
        return NULL;

    const size_t count = (size_t)row * (size_t)column;
    return (double*)malloc(count * sizeof(double));
}
// Fills a row x column matrix with the value 1. The matrix is a flat array in
// which element (i, j) is stored at index i*column + j.
void matrixInit(double *matrix, const int row, const int column)
{
    int i = 0;
    while (i < row)
    {
        int j = 0;
        while (j < column)
        {
            matrix[i * column + j] = 1;
            ++j;
        }
        ++i;
    }
}
// Compares two row x column matrices element by element (flat index
// i*column + j) using exact floating-point equality.
//
// mat1, mat2: matrices to compare, each holding at least row*column doubles.
// Returns TRUE when every entry is equal; otherwise prints the flat index of
// the first differing entry and returns FALSE.
int matEqual(double *mat1, double *mat2, const int row, const int column)
{
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < column; j++)
        {
            const int k = i * column + j;
            if (mat1[k] != mat2[k])
            {
                // Terminate the message with a real newline so that following
                // output starts on its own line.
                printf("Entry %d doens't match.\n", k);
                return FALSE;
            }
        }
    }
    return TRUE;
}
// CPU version of the matrix sum: for every element of the row x column
// matrices, stores m1 + m2 into n. Element (r, c) lives at flat index
// r*column + c in all three buffers.
void matrixSumCpu(double *m1, double *m2, double *n, const int row, const int column)
{
    for (int r = 0; r < row; ++r)
    {
        // Flat offset of the first element of row r.
        const int rowStart = r * column;
        for (int c = 0; c < column; ++c)
        {
            const int idx = rowStart + c;
            n[idx] = m1[idx] + m2[idx];
        }
    }
}
// Element-wise matrix sum kernel: n = m1 + m2 for row x column matrices
// stored as flat arrays (element (r, c) at r*column + c).
//
// Launch layout: any 1D or 2D grid/block configuration is valid. The x
// dimension walks columns and the y dimension walks rows, and each thread
// strides by the total number of launched threads in that dimension, so every
// element is written even when the grid is smaller than the matrix.
// No shared memory is used.
//
// m1, m2: device pointers to the inputs (at least row*column doubles each).
// n:      device pointer to the output (at least row*column doubles).
__global__ void _2dGrid2dBlockMatSum(double *m1, double *m2, double *n, const int row, const int column)
{
    // Total number of threads launched along each dimension.
    const int columnStride = gridDim.x * blockDim.x;
    const int rowStride = gridDim.y * blockDim.y;

    for (int r = blockIdx.y * blockDim.y + threadIdx.y; r < row; r += rowStride)
    {
        // Consecutive threadIdx.x values touch consecutive addresses, so the
        // loads/stores of neighbouring threads are contiguous in memory.
        for (int c = blockIdx.x * blockDim.x + threadIdx.x; c < column; c += columnStride)
        {
            // size_t keeps the flat index from overflowing int on large matrices.
            const size_t i = (size_t)r * (size_t)column + (size_t)c;
            n[i] = m1[i] + m2[i];
        }
    }
}
// Aborts the program when a CUDA runtime call reports a failure.
//
// code: status returned by a CUDA runtime API call.
// On any status other than cudaSuccess, prints the runtime's description of
// the error to stderr and terminates the process with exit code -1.
void checkGpuMalloc(cudaError_t code)
{
    if (code != cudaSuccess)
    {
        // Report before exiting: a message placed after exit() is never printed.
        fprintf(stderr, "CUDA ERROR occurred: %s\n", cudaGetErrorString(code));
        exit(-1);
    }
}
// Prints a preview of a row x column matrix: at most the first 3 rows and the
// first 6 columns, with "..." marking truncated rows/columns.
//
// mat: matrix stored as a flat array, element (i, j) at i*column + j.
// row, column: matrix dimensions.
void printMatrix(double *mat, const int row, const int column)
{
    const int rowToPrint = 3;
    const int columnToPrint = 6;
    // Clamp the preview to the real dimensions so small matrices are not read
    // past their end.
    const int rowsShown = row < rowToPrint ? row : rowToPrint;
    const int columnsShown = column < columnToPrint ? column : columnToPrint;

    for (int i = 0; i < rowsShown; i++)
    {
        for (int j = 0; j < columnsShown; j++)
            printf("%lf ", mat[i * column + j]); // trailing space keeps values apart
        if (column > columnToPrint)
            printf("...");
        printf("\n");
    }
    if (row > rowToPrint)
        printf("...\n");
}
// Entry point: sums two 2^10 x 2^10 matrices on the CPU and on the GPU
// (2D grid of 2D blocks), compares the results and prints whether they match.
// Every CUDA runtime call is checked; a failure terminates the program.
int main()
{
    const int row = 1 << 10, column = 1 << 10;
    // Buffer size computed once in size_t to avoid int overflow.
    const size_t bytes = (size_t)row * (size_t)column * sizeof(double);

    // Host buffers: h_m1/h_m2 are the inputs, h_n1 is the CPU result and
    // h_n2 is the GPU result copied back from the device (n = m1 + m2).
    double *h_m1 = mallocMatrix(row, column);
    double *h_m2 = mallocMatrix(row, column);
    double *h_n1 = mallocMatrix(row, column);
    double *h_n2 = mallocMatrix(row, column);
    if (h_m1 == NULL || h_m2 == NULL || h_n1 == NULL || h_n2 == NULL)
    {
        printf("Unable to allocate enough memory on CPU\n");
        // free(NULL) is a no-op, so releasing all four is safe.
        free(h_m1);
        free(h_m2);
        free(h_n1);
        free(h_n2);
        exit(-1);
    }

    matrixInit(h_m1, row, column);
    matrixInit(h_m2, row, column);

    printf("Summing matrices on CPU...\n");
    matrixSumCpu(h_m1, h_m2, h_n1, row, column);

    // Device buffers for the two inputs and the output.
    double *d_m1 = NULL, *d_m2 = NULL, *d_n = NULL;
    checkGpuMalloc(cudaMalloc((void**)&d_m1, bytes));
    checkGpuMalloc(cudaMalloc((void**)&d_m2, bytes));
    checkGpuMalloc(cudaMalloc((void**)&d_n, bytes));
    checkGpuMalloc(cudaMemcpy(d_m1, h_m1, bytes, cudaMemcpyHostToDevice));
    checkGpuMalloc(cudaMemcpy(d_m2, h_m2, bytes, cudaMemcpyHostToDevice));
    // Zero the output so that any element the kernel fails to write is
    // guaranteed to differ from the CPU result instead of matching by chance.
    checkGpuMalloc(cudaMemset(d_n, 0, bytes));

    printf("Summing matrices on GPU with 2D grid and 2D blocks.\n");
    // The launch dimensions must be dim3 values. Writing them as
    // <<<(1<<5,1<<5),(1<<5,1<<5)>>> does NOT work: a parenthesised comma
    // expression evaluates to its last operand, the scalar 32, which launches
    // a 1D grid of 32 blocks with 32 threads each.
    const dim3 block(1 << 5, 1 << 5);
    // Ceil-division so the grid covers the whole matrix in both dimensions
    // (32 x 32 blocks for the sizes used here).
    const dim3 grid((row + block.x - 1) / block.x, (column + block.y - 1) / block.y);
    _2dGrid2dBlockMatSum<<<grid, block>>>(d_m1, d_m2, d_n, row, column);
    // Launch-configuration errors are only visible through cudaGetLastError;
    // faults during execution surface at the synchronization point.
    checkGpuMalloc(cudaGetLastError());
    checkGpuMalloc(cudaDeviceSynchronize());
    checkGpuMalloc(cudaMemcpy(h_n2, d_n, bytes, cudaMemcpyDeviceToHost));

    if (matEqual(h_n1, h_n2, row, column))
    {
        printf("Matrices match.\n");
    }
    else
    {
        printf("Matrices don't match.\nResult on CPU:\n");
        printMatrix(h_n1, row, column);
        printf("Result on GPU:\n");
        printMatrix(h_n2, row, column);
    }

    // Release host and device memory, then reset the device.
    free(h_m1);
    free(h_m2);
    free(h_n1);
    free(h_n2);
    cudaFree(d_m1);
    cudaFree(d_m2);
    cudaFree(d_n);
    cudaDeviceReset();
    return 0;
}