我正试图用共轭梯度法求解Ax=b系统。
我使用的是nvidia示例中的示例,但不是使用cusparseScsrmv函数,而是使用cublasSgemv来执行Ax。
我的问题是cublasSdot函数中的"dot"变量为零,这导致a = r1 / dot
为nan。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <assert.h>
#include <cublas_v2.h>
using namespace std;
int main() {
int Rows = 80 ,Cols = 64;
int NbElements = Rows * Cols;
//allocate host memory
float * A, * x, * rhs;
A = (float *) malloc( NbElements * (Rows + Cols)* sizeof(float) );
x = (float *) malloc( (Rows + Cols) * sizeof(float) );
rhs = (float *)malloc(sizeof(float)* NbElements );
//allocate device memory
float * devA, * devx;
cudaMalloc( &devA, NbElements * (Rows + Cols) * sizeof(float) );
cudaMalloc( &devx, (Rows + Cols) * sizeof(float) );
//read Input
FILE * theFile;
theFile = fopen( "A", "rb" );
assert( NULL != theFile );
assert( NbElements * (Rows + Cols) == fread( A, sizeof(float), NbElements * (Rows + Cols) , theFile ) );
cudaMemcpy(devA, A, NbElements * (Rows + Cols) * sizeof(float),cudaMemcpyHostToDevice);
fclose( theFile );
const float tol = 1e-5f;
const int max_iter = 1000;
float a, b, na, r0, r1,dot;
float *devAx, *devr , *devp;
float alpha, beta, alpham1;
for (int i = 0; i < (Rows + Cols); i++)
x[i] = 0.0;
for (int i = 0; i < NbElements; i++)
rhs[i] = 1.0;
/* Get handle to the CUBLAS context */
cublasHandle_t cublasHandle = 0;
cublasStatus_t cublasStatus;
cublasStatus = cublasCreate(&cublasHandle);
cudaMalloc((void **)&devAx, NbElements * sizeof(float));
cudaMalloc((void **)&devr, NbElements * sizeof(float));
gcudaMalloc((void **)&devp, NbElements * sizeof(float));
cudaMemcpy(devx, x, (Rows + Cols) * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devr, rhs, NbElements * sizeof(float), cudaMemcpyHostToDevice);
alpha = 1.0;
alpham1 = -1.0;
beta = 0.0;
r0 = 0.0;
cublasSgemv(cublasHandle , CUBLAS_OP_N , NbElements , (Rows + Cols) , &alpha , devA , NbElements , devx , 1 , &beta , devAx , 1 );
cublasSaxpy(cublasHandle, NbElements , &alpham1, devAx, 1, devr, 1);
cublasStatus = cublasSdot(cublasHandle, NbElements , devr, 1, devr, 1, &r1);
int k = 1;
while (r1 > tol*tol && k <= max_iter)
{
if (k > 1)
{
b = r1 / r0;
cublasStatus = cublasSscal(cublasHandle, NbElements, &b, devp, 1);
cublasStatus = cublasSaxpy(cublasHandle, NbElements , &alpha, devr, 1, devp, 1);
}
else
{
cublasStatus = cublasScopy(cublasHandle, NbElements , devr, 1, devp, 1);
}
cublasSgemv(cublasHandle , CUBLAS_OP_N , NbElements , (Rows + Cols) , &alpha , devA , NbElements , devx , 1 , &beta , devAx , 1 );
cublasStatus = cublasSdot(cublasHandle, NbElements , devp, 1, devAx, 1, &dot);
a = r1 / dot;
cout <<"dot = "<<dot<<endl;
cublasStatus = cublasSaxpy(cublasHandle, NbElements , &a, devp, 1, devx, 1);
na = -a;
cublasStatus = cublasSaxpy(cublasHandle, NbElements , &na, devAx, 1, devr, 1);
r0 = r1;
cublasStatus = cublasSdot(cublasHandle, NbElements , devr, 1, devr, 1, &r1);
cudaThreadSynchronize();
cout << "niteration = "<<k<<" , residual = "<<sqrt(r1)<<endl;
k++;
}
cudaMemcpy(x, devx, (Rows + Cols) *sizeof(float), cudaMemcpyDeviceToHost);
cublasDestroy(cublasHandle);
{ FILE * theFile;
theFile = fopen( "X", "wb" );
assert( NULL != theFile );
fwrite( x, sizeof(float), (Rows + Cols) , theFile );
fclose( theFile );
}
free(rhs);
free( A );
free( x );
// clean up device memory
cudaFree( devA );
cudaFree( devx );
cudaFree( devr );
cudaFree( devp );
cudaFree( devAx );
return 0;
}
(我正在加载一个随机矩阵a,你可以在这里找到(
cuBlasSdot
指令中出现的devAx
数组
cublasStatus = cublasSdot(cublasHandle, NbElements , devp, 1, devAx, 1, &dot);
是一个消失矢量。这是因为devAx
阵列是在线上计算的
cublasSgemv(cublasHandle , CUBLAS_OP_N , NbElements , (Rows + Cols) , &alpha , devA , NbElements , devx , 1 , &beta , devAx , 1 );
作为CCD_ 5和CCD_。反过来,devx
是x
的cudaMemcpy
,其被初始化为消失矢量
for (int i = 0; i < (Rows + Cols); i++)
x[i] = 0.0;
我怀疑代码的其余部分没有多大意义,因为共轭梯度应该也适用于消失的初始猜测。