用CUDA求解稠密线性系统AX=B

我可以使用新的cuSOLVER库（CUDA 7）来求解形式的线性系统吗

AX = B

其中A、X和B是NxN密矩阵？

是。

进近编号1

在cuSOLVER的框架中，您可以使用QR分解，请参阅在CUDA中求解线性系统的QR分解。

接近编号2

或者，您可以通过连续调用来计算矩阵逆

 cublas<t>getrfBatched()

计算矩阵的LU分解的

cublas<t>getriBatched()

其从其LU分解开始计算矩阵的逆。

接近编号3

最后一种可能性是使用

cublas<t>getrfBatched()

然后双重调用

cublas<t>trsm()

其求解上或下三角线性系统。

正如Robert Crovella所指出的，答案可能因所涉及矩阵的大小和类型而异。

第1号进近代码

请参阅解决CUDA中线性系统的QR分解。

2号和3号进近代码

下面，我将报告一个实现方法（编号2和3）的工作示例。Hankel矩阵用于向方法提供条件良好的可逆矩阵。请注意，方法nr.3需要根据调用cublas<t>getrfBatched()之后获得的枢轴阵列来排列（重新排列）系统系数向量。这种排列可以在CPU上方便地完成。

#include <stdio.h>
#include <fstream>
#include <iomanip>
#include <stdlib.h>     /* srand, rand */
#include <time.h>       /* time */
#include "cuda_runtime.h" 
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define prec_save 10
#define BLOCKSIZE 256
#define BLOCKSIZEX 16
#define BLOCKSIZEY 16
/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "n";
    outfile.close();
}
/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
    T *h_in = (T *)malloc(M * sizeof(T));
    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "n";
    outfile.close();
}
/***************************************************/
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
/***************************************************/
// --- https://en.wikipedia.org/wiki/Hankel_matrix
void setHankelMatrix(double * __restrict h_A, const int N) {
    double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double));
    // --- Initialize random seed
    srand(time(NULL));
    // --- Generate random numbers
    for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand();
    // --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent.
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2];
    free(h_atemp);
}
/***********************************************/
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
/***********************************************/
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref, 
                               double * __restrict h_y, const int N) {
    for (int k = 0; k < N; k++) h_y[k] = 0.f;
    for (int m = 0; m < N; m++)
        for (int n = 0; n < N; n++)
            h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n];
}
/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
void rearrange(double *vec, int *pivotArray, int N){
    for (int i = 0; i < N; i++) {
        double temp = vec[i];
        vec[i] = vec[pivotArray[i] - 1];
        vec[pivotArray[i] - 1] = temp;
    }   
}
/********/
/* MAIN */
/********/
int main() {
    const unsigned int N = 1000;
    const unsigned int Nmatrices = 1;
    // --- CUBLAS initialization
    cublasHandle_t cublas_handle;
    cublasSafeCall(cublasCreate(&cublas_handle));
    TimingGPU timerLU, timerApproach1, timerApproach2;
    double timingLU, timingApproach1, timingApproach2;
    /***********************/
    /* SETTING THE PROBLEM */
    /***********************/
    // --- Matrices to be inverted (only one in this example)
    double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double));
    // --- Setting the Hankel matrix
    setHankelMatrix(h_A, N);
    // --- Defining the solution
    double *h_xref = (double *)malloc(N * sizeof(double));
    for (int k = 0; k < N; k++) h_xref[k] = 1.f;
    // --- Coefficient vectors (only one in this example)
    double *h_y = (double *)malloc(N * sizeof(double));
    computeCoefficientsVector(h_A, h_xref, h_y, N);
    // --- Result (only one in this example)
    double *h_x = (double *)malloc(N * sizeof(double));
    // --- Allocate device space for the input matrices 
    double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double)));
    double *d_y; gpuErrchk(cudaMalloc(&d_y, N *                 sizeof(double)));
    double *d_x; gpuErrchk(cudaMalloc(&d_x, N *                 sizeof(double)));
    // --- Move the relevant matrices from host to device
    gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y, h_y, N *                 sizeof(double), cudaMemcpyHostToDevice));
    /**********************************/
    /* COMPUTING THE LU DECOMPOSITION */
    /**********************************/
    timerLU.StartCounter();
    // --- Creating the array of pointers needed as input/output to the batched getrf
    double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N;
    double **d_inout_pointers;
    gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice));
    free(h_inout_pointers);
    int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int)));
    int *d_InfoArray;  gpuErrchk(cudaMalloc(&d_InfoArray,      Nmatrices * sizeof(int)));
    int *h_InfoArray  = (int *)malloc(Nmatrices * sizeof(int));
    cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
    //cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));
    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singularn", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }
    timingLU = timerLU.GetCounter();
    printf("Timing LU decomposition %f [ms]n", timingLU);
    /*********************************/
    /* CHECKING THE LU DECOMPOSITION */
    /*********************************/
    saveCPUrealtxt(h_A,          "D:\Project\solveSquareLinearSystemCUDA\solveSquareLinearSystemCUDA\A.txt", N * N);
    saveCPUrealtxt(h_y,          "D:\Project\solveSquareLinearSystemCUDA\solveSquareLinearSystemCUDA\y.txt", N);
    saveGPUrealtxt(d_A,          "D:\Project\solveSquareLinearSystemCUDA\solveSquareLinearSystemCUDA\Adecomposed.txt", N * N);
    saveGPUrealtxt(d_pivotArray, "D:\Project\solveSquareLinearSystemCUDA\solveSquareLinearSystemCUDA\pivotArray.txt", N);
    /******************************************************************************/
    /* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
    /******************************************************************************/
    timerApproach1.StartCounter();
    // --- Allocate device space for the inverted matrices 
    double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double)));
    // --- Creating the array of pointers needed as output to the batched getri
    double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double));
    double **d_out_pointers;
    gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice));
    free(h_out_pointers);
    cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));
    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
        fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singularn", i);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
        }
    double alpha1 = 1.f;
    double beta1 = 0.f;
    cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));
    timingApproach1 = timingLU + timerApproach1.GetCounter();
    printf("Timing approach 1 %f [ms]n", timingApproach1);
    /**************************/
    /* CHECKING APPROACH NR.1 */
    /**************************/
    saveGPUrealtxt(d_x, "D:\Project\solveSquareLinearSystemCUDA\solveSquareLinearSystemCUDA\xApproach1.txt", N);
    /*************************************************************/
    /* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
    /*************************************************************/
    timerApproach2.StartCounter();
    double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double)));
    gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
    int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int));
    gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
    rearrange(h_y, h_pivotArray, N);
    gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
    // --- Now P*A=L*U
    //     Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y
    // --- 1st phase - solve Ly = b 
    const double alpha = 1.f;
    // --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result 
    // --- Lower triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));
    // --- Upper triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));
    timingApproach2 = timingLU + timerApproach2.GetCounter();
    printf("Timing approach 2 %f [ms]n", timingApproach2);
    /**************************/
    /* CHECKING APPROACH NR.2 */
    /**************************/
    saveGPUrealtxt(d_y, "D:\Project\solveSquareLinearSystemCUDA\solveSquareLinearSystemCUDA\xApproach2.txt", N);
    return 0;
}

运行这样一个示例所需的Utilities.cu和Utilities.cuh文件都保存在这个github页面上。TimingGPU.cu和TimingGPU.cuh文件在此github页面中进行维护。

关于第三种方法的一些有用参考：

NAG Fortran库例程文档

科学计算软件库（SCSL）用户指南

https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c

编辑

进近2号和3号进近的时间（以毫秒为单位）（在GTX960板卡上进行的测试，cc.5.2）。

N        LU decomposition       Approach nr. 2       Approach nr. 3
100      1.08                   2.75                 1.28
500      45.4                   161                  45.7
1000     302                    1053                 303

正如它所出现的，方法nr.3更方便，其成本本质上是计算LU因子分解的成本。此外：

通过LU分解求解线性系统比使用QR分解更快（请参阅CUDA中的QR分解来求解线性系统）
LU分解仅限于平方线性系统，而QR分解在非线性系统的情况下有帮助

以下Matlab代码可用于检查结果

clear all
close all
clc
warning off
N = 1000;
% --- Setting the problem solution
x = ones(N, 1);
%%%%%%%%%%%%%%%%%%%%%
% NxN HANKEL MATRIX %
%%%%%%%%%%%%%%%%%%%%%
% --- https://en.wikipedia.org/wiki/Hankel_matrix
load A.txt
load y.txt
A = reshape(A, N, N);
yMatlab = A * x;
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %fn', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPUTATION OF THE LU DECOMPOSITION %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[Lmatlab, Umatlab] = lu(A);
load Adecomposed.txt
Adecomposed = reshape(Adecomposed, N, N);
L = eye(N);
for k = 1 : N
    L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
end
U = zeros(N);
for k = 1 : N
    U(k, k : N) = Adecomposed(k, k : N);
end
load pivotArray.txt
Pj = eye(N);
for j = 1 : N
   tempVector = Pj(j, :);
   Pj(j, :) = Pj(pivotArray(j), :);
   Pj(pivotArray(j), :) = tempVector;
end
fprintf('Percentage rms between Pj * A and L * U in CUDA %fn', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));
xprime      = inv(Lmatlab) * yMatlab;
xMatlab     = inv(Umatlab) * xprime;
fprintf('Percentage rms between reference solution and solution in Matlab %fn', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
load xApproach1.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %fn', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));
load xApproach2.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %fn', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));

相关内容

最新更新

热门标签：