我正在将代码从遗留的 cblas/lapacke 转换为 cuBLAS/cuSOLVER,但遇到了一些问题。我写了一个测试程序来深入研究这件事。附件是我正在使用的代码:
#include <stdio.h>
#include <assert.h>
#include <string.h>
#ifdef __CUDA
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#else
#include <complex.h>
#include "cblas.h"
#include "lapacke.h"
#endif /* __CUDA */
int main()
{
    /* INPUT MATRIX (3x3 Hermitian)
     *
     *   0    1-i   0
     *  1+i    0   1-i
     *   0    1+i   0
     *
     * Exact spectrum: eigenvalues -2, 0, +2.
     */
    int row = 3;
    int col = 3;
#ifdef __CUDA
    /* Row-major source data. cuBLAS/cuSOLVER are column-major, so the
     * matrix is transposed with cublasZgeam before the eigensolve. */
    cuDoubleComplex input[9];
    input[0] = make_cuDoubleComplex(0.0, 0.0);
    input[1] = make_cuDoubleComplex(1.0, -1.0);
    input[2] = make_cuDoubleComplex(0.0, 0.0);
    input[3] = make_cuDoubleComplex(1.0, 1.0);
    input[4] = make_cuDoubleComplex(0.0, 0.0);
    input[5] = make_cuDoubleComplex(1.0, -1.0);
    input[6] = make_cuDoubleComplex(0.0, 0.0);
    input[7] = make_cuDoubleComplex(1.0, 1.0);
    input[8] = make_cuDoubleComplex(0.0, 0.0);

    cublasHandle_t blasHandle;
    cublasStatus_t status = cublasCreate(&blasHandle); /* initialize cuBLAS context */
    assert(CUBLAS_STATUS_SUCCESS == status);           /* was previously unchecked */
    cusolverDnHandle_t solverHandle;
    cusolverStatus_t statusSolver = cusolverDnCreate(&solverHandle);
    assert(CUSOLVER_STATUS_SUCCESS == statusSolver);

    const cuDoubleComplex cmplx_1 = make_cuDoubleComplex(1.0, 0.0);
    const cuDoubleComplex cmplx_0 = make_cuDoubleComplex(0.0, 0.0);

    /* Managed (unified) buffer: holds the column-major input on entry and,
     * after Zheevd with jobz=VECTOR, the eigenvectors stored as COLUMNS
     * (eigenvector j occupies inputMatrix[j*row .. j*row + row-1]). */
    cuDoubleComplex *inputMatrix;
    cudaError_t cudaStat = cudaMallocManaged((void**)&inputMatrix, sizeof(cuDoubleComplex)*row*col, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);

    /* ROW MAJOR -> COL MAJOR: inputMatrix = 1*clone^T + 0*clone via geam. */
    cuDoubleComplex *inputMatrixClone;
    cudaStat = cudaMallocManaged((void**)&inputMatrixClone, sizeof(cuDoubleComplex)*row*col, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);                   /* was previously unchecked */
    memcpy(inputMatrixClone, input, sizeof(cuDoubleComplex)*row*col);
    status = cublasZgeam(blasHandle, CUBLAS_OP_T, CUBLAS_OP_N, row, col,
                         &cmplx_1, inputMatrixClone, row,
                         &cmplx_0, inputMatrixClone, row,
                         inputMatrix, row);
    assert(CUBLAS_STATUS_SUCCESS == status);
    /* Ensure the async geam has consumed its input before freeing it. */
    cudaStat = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat);
    cudaFree(inputMatrixClone);
    /* done row major to col major conversion */

    /* Workspace query + Hermitian eigensolve (Zheevd = divide & conquer). */
    int lwork = 0;
    int *devInfo = NULL;
    double *eig;
    cuDoubleComplex *d_work = NULL;
    cudaStat = cudaMallocManaged((void**)&eig, sizeof(double)*row, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMallocManaged((void**)&devInfo, sizeof(int), cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);
    cusolverStatus_t cusolver_status = cusolverDnZheevd_bufferSize(solverHandle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, row, inputMatrix, col, eig, &lwork);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    cudaStat = cudaMallocManaged((void**)&d_work, sizeof(cuDoubleComplex)*lwork, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);
    cusolver_status = cusolverDnZheevd(solverHandle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, row, inputMatrix, col, eig, d_work, lwork, devInfo);
    /* Sync is mandatory before the host touches managed eig/inputMatrix. */
    cudaStat = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    assert(0 == *devInfo);   /* devInfo == 0 means converged; was never inspected */

    FILE *fp = fopen("with_cuda.txt", "w+");
    assert(fp != NULL);      /* fopen can fail; was previously unchecked */
    /* eigenvalues (ascending order) */
    for (int i = 0; i < row; i++)
        fprintf(fp, "eig[%d] = %0.16f\n", i, eig[i]);   /* BUG FIX: "\n" had lost its backslash */
    /* eigenvectors: column-major, so a contiguous run of `row` entries is one eigenvector */
    for (int i = 0; i < row*col; i++)
        fprintf(fp, "eiv[%d] = %0.16f, %0.16f\n", i, cuCreal(inputMatrix[i]), cuCimag(inputMatrix[i]));
    fclose(fp);

    cudaFree(inputMatrix);
    cudaFree(eig);
    cudaFree(devInfo);
    cudaFree(d_work);
    cusolverDnDestroy(solverHandle);
    cublasDestroy(blasHandle); /* destroy cuBLAS context */
    cudaDeviceReset();
#else
    /* Row-major Hermitian input for the LAPACKE reference path. */
    double complex input[9];
    input[0] = 0;
    input[1] = 1 - _Complex_I;
    input[2] = 0;
    input[3] = 1 + _Complex_I;
    input[4] = 0;
    input[5] = 1 - _Complex_I;
    input[6] = 0;
    input[7] = 1 + _Complex_I;
    input[8] = 0;
    double eig[3];
    /* 'V': compute eigenvalues and eigenvectors; 'U': upper triangle of input.
     * On exit, eigenvector j is the j-th COLUMN of the (row-major stored)
     * matrix, i.e. components input[j], input[j+col], input[j+2*col] —
     * the memory layout therefore differs from the cuSOLVER column-major
     * output by exactly a transposition. */
    int info = LAPACKE_zheev(LAPACK_ROW_MAJOR, 'V', 'U', row, input, col, eig);
    assert(0 == info);       /* 0 = success; return code was previously ignored */

    FILE *fp = fopen("without_cuda.txt", "w+");
    assert(fp != NULL);      /* fopen can fail; was previously unchecked */
    /* eigenvalues (ascending order) */
    for (int i = 0; i < row; i++)
        fprintf(fp, "eig[%d] = %0.16f\n", i, eig[i]);   /* BUG FIX: "\n" had lost its backslash */
    /* eigenvectors (row-major storage of the eigenvector matrix) */
    for (int i = 0; i < row*col; i++)
        fprintf(fp, "eiv[%d] = %0.16f, %0.16f\n", i, creal(input[i]), cimag(input[i]));
    fclose(fp);
#endif
    return 0;
}
程序可以在定义 __CUDA 宏(#define __CUDA)的情况下编译以获得 CUDA 构建,也可以在不定义 __CUDA 的情况下编译以获得非 CUDA 构建。非 CUDA 构建给我以下输出:
eig[0] = -2.0000000000000000
eig[1] = -0.0000000000000002
eig[2] = 2.0000000000000000
eiv[0] = 0.0000000000000001, 0.5000000000000000
eiv[1] = 0.0000000000000002, 0.7071067811865476
eiv[2] = -0.0000000000000001, -0.5000000000000000
eiv[3] = 0.4999999999999998, -0.4999999999999999
eiv[4] = 0.0000000000000000, -0.0000000000000001
eiv[5] = 0.4999999999999998, -0.4999999999999999
eiv[6] = -0.4999999999999999, 0.0000000000000000
eiv[7] = 0.7071067811865475, 0.0000000000000000
eiv[8] = 0.4999999999999999, 0.0000000000000000
cuda构建为我提供了以下输出:
eig[0] = -2.0000000000000000
eig[1] = 0.0000000000000002
eig[2] = 2.0000000000000000
eiv[0] = 0.0000000000000001, 0.5000000000000000
eiv[1] = 0.4999999999999997, -0.4999999999999998
eiv[2] = -0.4999999999999999, 0.0000000000000000
eiv[3] = 0.0000000000000001, 0.7071067811865476
eiv[4] = -0.0000000000000000, 0.0000000000000000
eiv[5] = 0.7071067811865475, 0.0000000000000000
eiv[6] = 0.0000000000000001, 0.5000000000000000
eiv[7] = -0.4999999999999998, 0.4999999999999999
eiv[8] = -0.4999999999999999, 0.0000000000000000
有人能解释一下这个问题吗?为什么我得到的特征向量大多不同?特征向量的存储顺序看起来也像是被转置了。为什么?
特征值在我看来是一样的,从低到高(-2、0 和 2)。
使用 LAPACK(行主序)得到的特征向量是:
v_1 = (i/2 , (1-i)/2 , -1/2)
v_2 = (sqrt(2)i/2 , 0 , sqrt(2)/2)
v_3 = (i/2 , (-1+i)/2 , -1/2)
与 CUDA(列主序)的结果相比,只有最后一个不同,并且带有一个全局负号(即 v'_3 = (-i/2, -(-1+i)/2, +1/2))。它也是该矩阵的特征向量,具有相同的特征值(2)。所以,我认为两者都是正确的。