我想测试 CUDA 11.1 中 cuSOLVER 提供的 Xgels(最小二乘求解)实现,但无法让它给出正确的结果。例如,下面这段代码可以运行且不报告任何错误状态:
#include <algorithm>
#include <cstdlib>
#include <iostream>

#include <armadillo>
#include <cusolverDn.h>
int main()
{
const int m = 1000;
const int n = 10;
const int nrhs = 2;
arma::mat A(m, n, arma::fill::randn);
arma::mat B(m, nrhs, arma::fill::randn);
A.col(0).fill(1.0);
B += 10.0;
const arma::mat refX = arma::solve(A, B);
cusolverDnHandle_t handle;
cusolverDnCreate(&handle);
cusolverStatus_t status;
const int lda = m;
const int ldb = std::max(m, n);
size_t l_work = 0;
status = cusolverDnDDgels_bufferSize(
handle,
m, n, nrhs,
NULL, lda,
NULL, ldb,
NULL, ldb,
NULL, &l_work);
std::cout <<"Workspace: " << l_work << "!***n";
//One if ok
std::cout << "Find Workspace - 1 if ok: "<<(status == CUSOLVER_STATUS_SUCCESS)<<"!***n";
double* d_work;
cudaMalloc(reinterpret_cast<void**>(&d_work), l_work);
int* d_info;
cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int));
cudaMemset(d_info, 0, sizeof(int));
double* dA, *dB, *dX;
cudaMalloc(reinterpret_cast<void**>(&dA), A.n_elem* sizeof(double));
cudaMalloc(reinterpret_cast<void**>(&dB), B.n_elem* sizeof(double));
cudaMemcpy(dA,A.memptr(), A.n_elem * sizeof(double),cudaMemcpyHostToDevice);
cudaMemcpy(dB, B.memptr(), B.n_elem * sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc(reinterpret_cast<void**>(&dX), refX.n_elem * sizeof(double));
cudaMemset(dX, 0, refX.n_elem * sizeof(double));
int iter = 0;
status = cusolverDnDDgels(handle,
m, n, nrhs,
dA, lda,
dB, ldb,
dX, ldb,
d_work, l_work,
&iter, d_info);
//One if ok
std::cout << "Solve status - 1 if ok: " << (status == CUSOLVER_STATUS_SUCCESS) << "!***n";
int h_info = -1;
cudaMemcpy(&h_info, d_info,sizeof(int),cudaMemcpyDeviceToHost);
std::cout << "Iter: " <<iter << "!n";
//0 if ok
std::cout << "Info - 0 if ok :" << h_info << "!n";
//Comparison of the results results
arma::mat cudaX(refX.n_rows, refX.n_cols);
cudaMemcpy(cudaX.memptr(), dX, cudaX.n_elem * sizeof(double), cudaMemcpyDeviceToHost);
std::cout << "Armadillo result:n" <<refX.t() <<"n";
std::cout << "cusolver result:n" << cudaX.t() << "n";
cudaFree(dA);
cudaFree(dB);
cudaFree(dX);
cudaFree(d_work);
cudaFree(d_info);
}
不幸的是,结果是错误的,因为只有第一列看起来还可以:
Workspace: 3653888!***
Find Workspace - 1 if ok: 1!***
Solve status - 1 if ok: 1!***
Iter: -51!
Info - 0 if ok :0!
Armadillo result:
9.9965 -0.0198 0.0290 -0.0317 0.0027 -0.0197 0.0377 -0.0379 -0.0172 0.0088
9.9774 0.0485 0.0089 -0.0233 0.0054 -0.0257 0.0130 0.0080 0.0149 -0.0335
cusolver result:
9.9965 -0.0198 0.0290 -0.0317 0.0027 -0.0197 0.0377 -0.0379 -0.0172 0.0088
-0.8578 0.1884 0.5331 -0.8275 0.1992 -0.0587 1.0014 -0.0250 0.6571 -0.5516
如果我用 cuda-memcheck 运行该程序,得到的第一个错误是:
========= CUDA-MEMCHECK
========= Invalid __global__ write of size 8
========= at 0x00001aa0 in void copy_AtoB_kernel<double>(int, int, double const *, int, double*, int)
========= by thread (31,0,0) in block (15,0,0)
========= Address 0xb00e9b2f8 is out of bounds
========= Device Frame:void copy_AtoB_kernel<double>(int, int, double const *, int, double*, int) (void copy_AtoB_kernel<double>(int, int, double const *, int, double*, int) : 0x1aa0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x751f4]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x75577]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x79cd9]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll (cuProfilerStop + 0x11ce4a) [0x32e5ba]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x16cfe5]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll (cuProfilerStop + 0xf1052) [0x3027c2]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x3841d]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x3890c]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll [0x38be4]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nv_dispi.inf_amd64_8e1b465b962975a0\nvcuda64.dll (cuLaunchKernel + 0x234) [0x201044]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll [0x4856]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll [0x22b4]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSParamsSetTolInner + 0x2269) [0xda299]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSParamsSetTolInner + 0x29f1) [0xdaa21]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSParamsSetTolInner + 0xea66) [0xe6a96]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnZZgesv_bufferSize + 0x5b61) [0x103d41]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnIRSXgels + 0x4b6) [0x109cb6]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\cusolver64_11.dll (cusolverDnZZgesv_bufferSize + 0x11e4) [0xff3c4]
========= Host Frame:C:\sw_source\SEM.Maeve\build\Release\PerformanceTest.exe (main + 0x36c) [0x292d08c]
========= Host Frame:C:\sw_source\SEM.Maeve\build\Release\PerformanceTest.exe (__scrt_common_main_seh + 0x10c) [0x2ce4378]
========= Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17c24]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6d4d1]
因此,代码似乎存在内存错误,但我在自己对 API 的使用中找不出任何问题。此外,如果我注释掉对 cusolverDnDDgels 的调用,错误就会消失。
有没有线索表明是什么导致了这个代码的失败?
我在 RTX 2080 Ti 上执行代码,代码使用以下版本的 nvcc 编译:
> nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Tue_Sep_15_19:12:04_Pacific_Daylight_Time_2020
Cuda compilation tools, release 11.1, V11.1.74
Build cuda_11.1.relgpu_drvr455TC455_06.29069683_0
编译环境为 Windows 上的 Visual Studio,使用的代码生成选项为:compute_70,sm_70;compute_75,sm_75;
根据我的测试,如果你:
- 更新到 CUDA 11.1 Update 1(此时 `nvcc --version` 报告 11.1.105);
- 将 `lddx` 参数改为等于 `n`:
status = cusolverDnDDgels(handle, m, n, nrhs, dA, lda, dB, ldb, dX, n, //change here and in the buffersize function from ldb to n ...
那么 cusolver 和 armadillo 的结果就一致了。
(原因:dX 只分配了 n×nrhs 个元素,如果 lddx 取 ldb(即 m),解的第二列会被写到缓冲区之外。)