我正在尝试编写一个程序来将cusolverSp连接到fortran上。 虽然我对用 C 语言编写 cuda 并不陌生,但我不确定如何在 fortran 上获取它。
以下是我的代码:
! Fortran Console Application
!
module cuda_cusolverSP
interface
! cudaMalloc
integer (c_int) function cudaMalloc ( buffer, size ) bind (C, name="cudaMalloc" )
use iso_c_binding
implicit none
type (c_ptr) :: buffer
integer (c_size_t), value :: size
end function cudaMalloc
! cudaMemcpy
integer (c_int) function cudaMemcpy ( dst, src, count, kind ) bind (C, name="cudaMemcpy" )
! note: cudaMemcpyHostToDevice = 1
! note: cudaMemcpyDeviceToHost = 2
use iso_c_binding
type (C_PTR), value :: dst, src
integer (c_size_t), value :: count, kind
end function cudaMemcpy
! cudaFree
integer (c_int) function cudaFree(buffer) bind(C, name="cudaFree")
use iso_c_binding
implicit none
type (C_PTR), value :: buffer
end function cudaFree
integer (c_int) function cudaMemGetInfo(fre, tot) bind(C, name="cudaMemGetInfo")
use iso_c_binding
implicit none
type(c_ptr),value :: fre
type(c_ptr),value :: tot
end function cudaMemGetInfo
integer(c_int) function cusolverSpCreate(cusolver_Hndl) bind(C,name="cusolverSpCreate")
use iso_c_binding
implicit none
type(c_ptr)::cusolver_Hndl
end function
integer(c_int) function cusolverSpDestroy(cusolver_Hndl) bind(C,name="cusolverSpDestroy")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
end function
integer(c_int) function cusolverSpSgetrf_bufferSize(cusolver_Hndl,m,n,d_A,lda,Lwork) bind(C,name="cusolverSpSgetrf_bufferSize")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
integer(c_int),value::m
integer(c_int),value::n
type(c_ptr),value::d_A
integer(c_int),value::lda
type(c_ptr),value::Lwork
end function
integer(c_int) function cusolverSpSgetrf(cusolver_Hndl,m,n,d_A,lda,d_WS,d_Ipiv,d_devInfo) bind(C, name="cusolverSpSgetrf")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
integer(c_int),value::m
integer(c_int),value::n
type(c_ptr),value::d_A
integer(c_int),value::lda
type(c_ptr),value::d_WS
type(c_ptr),value::d_Ipiv
type(c_ptr),value::d_devInfo
end function
integer (c_int) function cusolverSpSgetrs(cusolver_Hndl,trans,n,nrhs,d_A,lda,d_Ipiv,d_B,ldb,d_devInfo) bind(C, name="cusolverSpSgetrs")
use iso_c_binding
implicit none
type(c_ptr),value::cusolver_Hndl
integer(c_int), value::trans
integer(c_int), value::n
integer(c_int), value::nrhs
type(c_ptr),value::d_A
integer(c_int), value::lda
type(c_ptr),value::d_Ipiv
type(c_ptr),value::d_B
integer(c_int),value::ldb
type(c_ptr),value::d_devInfo
end function
end interface
end module
program prog
use iso_c_binding
use cuda_cusolverSP
! ------ Matrix Definition & host CPU storage variables
integer(c_int) rowsA ! number of rows of A
integer(c_int) colsA ! number of columns of A
integer(c_int) nnzA ! number of nonzeros of A
integer(c_int) baseA ! base index in CSR format
! CSR(A) from I/O <--- pointers to host CPU memory
type(c_ptr) :: h_csrRowPtrA
type(c_ptr) :: h_csrColIndA(:)
type(c_ptr) :: h_csrValA(:)
type(c_ptr) :: h_x ! x = A b
type(c_ptr) :: h_b ! b = ones(m,1)
type(c_ptr) :: h_r ! r = b - A*x
type(c_ptr) :: h_Q ! <int> n
! reorder to reduce zero fill-in
! Q = symrcm(A) or Q = symamd(A)
! B = Q*A*Q^T
type(c_ptr) :: h_csrRowPtrB ! <int> n+1
type(c_ptr) :: h_csrColIndB ! <int> nnzA
type(c_ptr) :: h_csrValB ! <double> nnzA
type(c_ptr) :: h_mapBfromA ! <int> nnzA
integer size_perm
type(c_ptr) :: buffer_cpu ! working space for permutation: B = Q*A*Q^T
! -------------------- pointers to device memory
type(c_ptr) :: d_csrRowPtrA
type(c_ptr) :: d_csrColIndA
type(c_ptr) :: d_csrValA
type(c_ptr) :: d_x ! x = A b
type(c_ptr) :: d_b ! a copy of h_b
type(c_ptr) :: d_r ! r = b - A*x
doubleprecision tol
integer reorder
integer singularity
type(c_ptr)::cpfre,cptot
integer*8,target::free,total
integer res
integer*8 cudaMemcpyDeviceToHost, cudaMemcpyHostToDevice
integer*4 CUBLAS_OP_N, CUBLAS_OP_T
parameter (cudaMemcpyHostToDevice=1)
parameter (cudaMemcpyDeviceToHost=2)
parameter (CUBLAS_OP_N=0)
parameter (CUBLAS_OP_T=1)
! ==================================================================
rowsA = 0
colsA = 0
nnzA = 0
baseA = 0
A_size = SIZEOF(rowsA)
B_size = SIZEOF(B)
X_size = SIZEOF(X)
size_perm = 0
tol = 1.e-12
reorder = 0 ! no reordering
singularity = 0 ! -1 if A is invertible under tol.
! Step 1: Create cudense handle ---------------
cusolver_stat = cusolverSpCreate(cusolver_Hndl)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cusolverSpCreate error: ", cusolver_stat
write (*,*)
stop
end if
! Step 2: copy A and B to Device
A_mem_stat = cudaMalloc(d_A,A_size)
if (A_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMalloc 1 error: ", A_mem_stat
write (*,*)
stop
end if
B_mem_stat = cudaMalloc(d_B,B_size)
if (B_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMalloc 2 error: ", B_mem_stat
write (*,*)
stop
end if
! ---------- copy A and B to Device
A_mem_stat = cudaMemcpy(d_A,CPU_A_ptr,A_size,cudaMemcpyHostToDevice)
if (A_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMemcpy 1 error: ", A_mem_stat
write (*,*)
! stop
end if
B_mem_stat = cudaMemcpy(d_B,CPU_B_ptr,B_size,cudaMemcpyHostToDevice)
if (B_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMemcpy 2 error: ", B_mem_stat
write (*,*)
! stop
end if
! Step 3: query working space of Sgetrf (and allocate memory on device)
Lwork = 5
cusolver_stat = cusolverSpSgetrf_bufferSize(cusolver_Hndl,m,n,d_A,lda,CPU_Lwork_ptr)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " SpSgetrf_bufferSize error: ", cusolver_stat
write (*,*)
! stop
end if
write (*,*)
write (*, '(A, I12)') " Lwork: ", Lwork
write (*,*)
Workspace = 4*Lwork
WS_mem_stat = cudaMalloc(d_WS,Workspace)
if (WS_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMalloc 6 error: ", WS_mem_stat
write (*,*)
! stop
end if
! Step 4: compute LU factorization of [A]
cusolver_stat = cusolverSpSgetrf(cusolver_Hndl,m,n,d_A,lda,d_WS,d_Ipiv,d_devInfo)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cusolverSpSgetrf error: ", WS_mem_stat
write (*,*)
! stop
end if
! Step 5: compute solution vector [X] for Right hand side [B]
cusolver_stat = cusolverSpSgetrs(cusolver_Hndl,CUBLAS_OP_N,n,nrhs,d_A,lda,d_Ipiv,d_B,ldb,d_devInfo)
if (cusolver_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cusolverSpSgetrs error: ", WS_mem_stat
write (*,*)
! stop
end if
! Step 6: copy solution vector stored in [B] on device into [X] vector on host
X_mem_stat = cudaMemcpy(CPU_X_ptr,d_B,B_size,cudaMemcpyDeviceToHost)
if (X_mem_stat .ne. 0 ) then
write (*,*)
write (*, '(A, I2)') " cudaMemcpy 4 error: ", WS_mem_stat
write (*,*)
! stop
end if
! do i = 1, n
! print *, x(i,1)
! enddo
! step 7: free memory on device and release CPU-side resources
A_mem_Stat = cudafree(d_A)
B_mem_Stat = cudafree(d_B)
Ipiv_mem_stat = cudafree(d_Ipiv)
WS_mem_stat = cudafree(d_WS)
Lwork_mem_stat = cudafree(d_Lwork)
cusolver_stat = cusolverSpDestroy(cusolver_Hndl)
! Step 8: deallocate memory on host before exit
! deallocate(A)
! deallocate(ATest)
! deallocate(B)
! deallocate(X)
! deallocate(Ipiv)
end program prog
构建过程中的当前错误是
错误 S0188:参数编号 # 到 cusolverspcreate/etc:类型不匹配
我不知道如何解决它。该程序是对工作cusolverDn的修改,我敢肯定这意味着我犯了很多错误,因为我可以参考的接口样本并不多。
您的主程序中没有implicit none
,并且未声明cusolver_Hndl
,因此假定它是real
。
使用 implicit none
并声明所有变量。 cusolver_Hndl
应该是type(ptr)
的,不要忘记设置其值(如果它不是输出参数,则接口不会显示任何意图(。