来自cublas函数的零值



我正试图用共轭梯度法求解Ax=b系统。

我使用的是nvidia示例中的示例,但不是使用cusparseScsrmv函数,而是使用cublasSgemv来执行Ax。

我的问题是cublasSdot函数中的"dot"变量为零,这导致a = r1 / dot为nan。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <assert.h>
#include <cublas_v2.h>
using namespace std;

int main() {
    int Rows = 80 ,Cols = 64;
    int NbElements = Rows * Cols;
    //allocate host memory
    float * A, * x, * rhs;
    A = (float *) malloc( NbElements  * (Rows + Cols)* sizeof(float) );
    x = (float *) malloc( (Rows + Cols) * sizeof(float) );
    rhs = (float *)malloc(sizeof(float)*  NbElements );
    //allocate device memory
    float * devA, * devx;
    cudaMalloc( &devA, NbElements * (Rows + Cols) * sizeof(float) );
    cudaMalloc( &devx, (Rows + Cols)  * sizeof(float) );
    //read Input
    FILE * theFile;
    theFile = fopen( "A", "rb" );
    assert( NULL != theFile );
    assert( NbElements * (Rows + Cols) == fread( A, sizeof(float), NbElements * (Rows + Cols) , theFile ) );
    cudaMemcpy(devA, A, NbElements *  (Rows + Cols) * sizeof(float),cudaMemcpyHostToDevice);
    fclose( theFile );

    const float tol = 1e-5f;
    const int max_iter = 1000;
    float a, b, na, r0, r1,dot;
    float *devAx, *devr , *devp;
    float alpha, beta, alpham1;
    for (int i = 0; i < (Rows + Cols); i++)
        x[i] = 0.0;
    for (int i = 0; i < NbElements; i++)
        rhs[i] = 1.0;
    /* Get handle to the CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    cudaMalloc((void **)&devAx, NbElements * sizeof(float));
    cudaMalloc((void **)&devr, NbElements * sizeof(float));
    gcudaMalloc((void **)&devp, NbElements * sizeof(float));
    cudaMemcpy(devx, x, (Rows + Cols) * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(devr, rhs, NbElements * sizeof(float), cudaMemcpyHostToDevice);
    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.0;
    cublasSgemv(cublasHandle , CUBLAS_OP_N , NbElements  , (Rows + Cols) , &alpha , devA ,  NbElements  ,  devx , 1 , &beta , devAx , 1 );
    cublasSaxpy(cublasHandle,  NbElements  , &alpham1, devAx, 1, devr, 1);
    cublasStatus = cublasSdot(cublasHandle, NbElements , devr, 1, devr, 1, &r1);
    int k = 1;
    while (r1 > tol*tol && k <= max_iter)
    {
        if (k > 1)
        {
            b = r1 / r0;
            cublasStatus = cublasSscal(cublasHandle, NbElements, &b, devp, 1);
            cublasStatus = cublasSaxpy(cublasHandle,  NbElements , &alpha, devr, 1, devp, 1);
        }
        else
        {
            cublasStatus = cublasScopy(cublasHandle,  NbElements , devr, 1, devp, 1);
        }
        cublasSgemv(cublasHandle , CUBLAS_OP_N , NbElements  , (Rows + Cols) ,  &alpha , devA ,  NbElements  ,  devx , 1 , &beta , devAx , 1 );
        cublasStatus = cublasSdot(cublasHandle,  NbElements , devp, 1, devAx, 1, &dot);
        a = r1 / dot;
        cout <<"dot = "<<dot<<endl;
        cublasStatus = cublasSaxpy(cublasHandle,  NbElements , &a, devp, 1, devx, 1);
        na = -a;
        cublasStatus = cublasSaxpy(cublasHandle,  NbElements , &na, devAx, 1, devr, 1);
        r0 = r1;
        cublasStatus = cublasSdot(cublasHandle,  NbElements , devr, 1, devr, 1, &r1);
        cudaThreadSynchronize();
        cout << "niteration = "<<k<<" , residual = "<<sqrt(r1)<<endl;
        k++;
    }
    cudaMemcpy(x, devx, (Rows + Cols) *sizeof(float), cudaMemcpyDeviceToHost);
    cublasDestroy(cublasHandle);

    {   FILE * theFile;
        theFile = fopen( "X", "wb" );
        assert( NULL != theFile );
        fwrite( x, sizeof(float), (Rows + Cols) , theFile );
        fclose( theFile );
    }
    free(rhs);
    free( A );
    free( x );
    // clean up device memory
    cudaFree( devA );
    cudaFree( devx );
    cudaFree( devr );
    cudaFree( devp );
    cudaFree( devAx );
    return 0;
} 

(我正在加载一个随机矩阵a,你可以在这里找到(

cuBlasSdot指令中出现的devAx数组

cublasStatus = cublasSdot(cublasHandle,  NbElements , devp, 1, devAx, 1, &dot);

是一个消失矢量。这是因为devAx阵列是在线上计算的

cublasSgemv(cublasHandle , CUBLAS_OP_N , NbElements  , (Rows + Cols) , &alpha , devA ,  NbElements  ,  devx , 1 , &beta , devAx , 1 );

作为CCD_ 5和CCD_。反过来,devxxcudaMemcpy,其被初始化为消失矢量

for (int i = 0; i < (Rows + Cols); i++)
    x[i] = 0.0;

我怀疑代码的其余部分没有多大意义,因为共轭梯度应该也适用于消失的初始猜测。

相关内容

  • 没有找到相关文章

最新更新