Fortran 上的 CuSolver 稀疏接口



我正在尝试编写一个程序来将cusolverSp连接到fortran上。 虽然我对用 C 语言编写 cuda 并不陌生,但我不确定如何在 fortran 上获取它。

以下是我的代码:

!  Fortran Console Application 
!
  module cuda_cusolverSP
  interface
 ! cudaMalloc
 integer (c_int) function cudaMalloc ( buffer, size ) bind (C, name="cudaMalloc" ) 
   use iso_c_binding
   implicit none
   type (c_ptr)  :: buffer
   integer (c_size_t), value :: size
 end function cudaMalloc
 ! cudaMemcpy 
 integer (c_int) function cudaMemcpy ( dst, src, count, kind ) bind (C, name="cudaMemcpy" )
   ! note: cudaMemcpyHostToDevice = 1
   ! note: cudaMemcpyDeviceToHost = 2
   use iso_c_binding
   type (C_PTR), value :: dst, src
   integer (c_size_t), value :: count, kind
 end function cudaMemcpy
 ! cudaFree
 integer (c_int) function cudaFree(buffer)  bind(C, name="cudaFree")
   use iso_c_binding
   implicit none
   type (C_PTR), value :: buffer
 end function cudaFree
 integer (c_int) function cudaMemGetInfo(fre, tot) bind(C, name="cudaMemGetInfo")
   use iso_c_binding
   implicit none
   type(c_ptr),value :: fre
   type(c_ptr),value :: tot
 end function cudaMemGetInfo

 integer(c_int) function cusolverSpCreate(cusolver_Hndl) bind(C,name="cusolverSpCreate")
 use iso_c_binding
 implicit none
 type(c_ptr)::cusolver_Hndl
 end function
 integer(c_int) function cusolverSpDestroy(cusolver_Hndl) bind(C,name="cusolverSpDestroy")
 use iso_c_binding
 implicit none
 type(c_ptr),value::cusolver_Hndl
 end function
integer(c_int) function cusolverSpSgetrf_bufferSize(cusolver_Hndl,m,n,d_A,lda,Lwork) bind(C,name="cusolverSpSgetrf_bufferSize") 
  use iso_c_binding
  implicit none
  type(c_ptr),value::cusolver_Hndl
  integer(c_int),value::m
  integer(c_int),value::n
  type(c_ptr),value::d_A
  integer(c_int),value::lda
  type(c_ptr),value::Lwork
end function
 integer(c_int) function cusolverSpSgetrf(cusolver_Hndl,m,n,d_A,lda,d_WS,d_Ipiv,d_devInfo) bind(C, name="cusolverSpSgetrf")
   use iso_c_binding
   implicit none
   type(c_ptr),value::cusolver_Hndl
   integer(c_int),value::m
   integer(c_int),value::n
   type(c_ptr),value::d_A
   integer(c_int),value::lda
   type(c_ptr),value::d_WS
   type(c_ptr),value::d_Ipiv
   type(c_ptr),value::d_devInfo
 end function 
 integer (c_int) function cusolverSpSgetrs(cusolver_Hndl,trans,n,nrhs,d_A,lda,d_Ipiv,d_B,ldb,d_devInfo) bind(C, name="cusolverSpSgetrs")
   use iso_c_binding
   implicit none
   type(c_ptr),value::cusolver_Hndl
   integer(c_int), value::trans
   integer(c_int), value::n
   integer(c_int), value::nrhs
   type(c_ptr),value::d_A
   integer(c_int), value::lda    
   type(c_ptr),value::d_Ipiv
   type(c_ptr),value::d_B
   integer(c_int),value::ldb
   type(c_ptr),value::d_devInfo
      end function
   end interface  
  end module 
program prog
use iso_c_binding
use cuda_cusolverSP
! ------ Matrix Definition & host CPU storage variables 
integer(c_int) rowsA  ! number of rows of A
integer(c_int) colsA  ! number of columns of A
integer(c_int) nnzA   ! number of nonzeros of A
integer(c_int) baseA  ! base index in CSR format
! CSR(A) from I/O <--- pointers to host CPU memory
type(c_ptr) :: h_csrRowPtrA
type(c_ptr) :: h_csrColIndA(:)
type(c_ptr) :: h_csrValA(:) 
type(c_ptr) :: h_x  ! x = A  b
type(c_ptr) :: h_b  ! b = ones(m,1)
type(c_ptr) :: h_r  ! r = b - A*x
type(c_ptr) :: h_Q  ! <int> n
                    ! reorder to reduce zero fill-in
                    ! Q = symrcm(A) or Q = symamd(A)
!   B = Q*A*Q^T
type(c_ptr) :: h_csrRowPtrB ! <int> n+1
type(c_ptr) :: h_csrColIndB ! <int> nnzA
type(c_ptr) :: h_csrValB    ! <double> nnzA
type(c_ptr) :: h_mapBfromA  ! <int> nnzA

integer size_perm
type(c_ptr) :: buffer_cpu   ! working space for permutation: B = Q*A*Q^T

! -------------------- pointers to device memory    
type(c_ptr) :: d_csrRowPtrA
type(c_ptr) :: d_csrColIndA
type(c_ptr) :: d_csrValA 
type(c_ptr) :: d_x    ! x = A  b
type(c_ptr) :: d_b    ! a copy of h_b
type(c_ptr) :: d_r    ! r = b - A*x
doubleprecision tol
integer reorder
integer singularity 
type(c_ptr)::cpfre,cptot
integer*8,target::free,total
integer res
integer*8 cudaMemcpyDeviceToHost, cudaMemcpyHostToDevice
integer*4 CUBLAS_OP_N, CUBLAS_OP_T
parameter (cudaMemcpyHostToDevice=1)
parameter (cudaMemcpyDeviceToHost=2)
parameter (CUBLAS_OP_N=0)
parameter (CUBLAS_OP_T=1)   

! ==================================================================
rowsA = 0
colsA = 0
nnzA = 0
baseA = 0    

A_size = SIZEOF(rowsA)
B_size = SIZEOF(B)
X_size = SIZEOF(X)
size_perm = 0
tol = 1.e-12
reorder = 0 ! no reordering
singularity = 0 ! -1 if A is invertible under tol.
! Step 1: Create cudense handle ---------------
cusolver_stat = cusolverSpCreate(cusolver_Hndl)  
if (cusolver_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cusolverSpCreate error: ", cusolver_stat
    write (*,*)
    stop
end if
! Step 2: copy A and B to Device
A_mem_stat    = cudaMalloc(d_A,A_size)
if (A_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMalloc 1 error: ", A_mem_stat
    write (*,*)
    stop
end if  
B_mem_stat    = cudaMalloc(d_B,B_size)  
if (B_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMalloc 2 error: ", B_mem_stat
    write (*,*)
    stop
end if    

! ---------- copy A and B to Device
A_mem_stat = cudaMemcpy(d_A,CPU_A_ptr,A_size,cudaMemcpyHostToDevice)
if (A_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMemcpy 1 error: ", A_mem_stat
    write (*,*)
!       stop
   end if
B_mem_stat = cudaMemcpy(d_B,CPU_B_ptr,B_size,cudaMemcpyHostToDevice)
if (B_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMemcpy 2 error: ", B_mem_stat
    write (*,*)
!       stop
    end if
! Step 3: query working space of Sgetrf (and allocate memory on device)
Lwork = 5
cusolver_stat =  cusolverSpSgetrf_bufferSize(cusolver_Hndl,m,n,d_A,lda,CPU_Lwork_ptr) 
if (cusolver_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " SpSgetrf_bufferSize error: ", cusolver_stat
    write (*,*)
!      stop
    end if
write (*,*)
write (*, '(A, I12)') " Lwork: ", Lwork
write (*,*)

Workspace = 4*Lwork
WS_mem_stat = cudaMalloc(d_WS,Workspace)
if (WS_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMalloc 6 error: ", WS_mem_stat
    write (*,*)
!      stop
    end if
! Step 4: compute LU factorization of [A] 
cusolver_stat = cusolverSpSgetrf(cusolver_Hndl,m,n,d_A,lda,d_WS,d_Ipiv,d_devInfo) 
if (cusolver_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cusolverSpSgetrf error: ", WS_mem_stat
    write (*,*)
!      stop
    end if
! Step 5: compute solution vector [X] for Right hand side [B]
cusolver_stat = cusolverSpSgetrs(cusolver_Hndl,CUBLAS_OP_N,n,nrhs,d_A,lda,d_Ipiv,d_B,ldb,d_devInfo)  
if (cusolver_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cusolverSpSgetrs error: ", WS_mem_stat
    write (*,*)
!      stop
    end if
! Step 6: copy solution vector stored in [B] on device into [X] vector on host
X_mem_stat = cudaMemcpy(CPU_X_ptr,d_B,B_size,cudaMemcpyDeviceToHost)  
if (X_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMemcpy 4 error: ", WS_mem_stat
    write (*,*)
!      stop
    end if
!    do i = 1, n
!        print *, x(i,1)
!    enddo
! step 7: free memory on device and release CPU-side resources
A_mem_Stat    = cudafree(d_A)
B_mem_Stat    = cudafree(d_B)
Ipiv_mem_stat = cudafree(d_Ipiv)
WS_mem_stat   = cudafree(d_WS)
Lwork_mem_stat = cudafree(d_Lwork)
cusolver_stat = cusolverSpDestroy(cusolver_Hndl)
! Step 8: deallocate memory on host before exit
!    deallocate(A)
!   deallocate(ATest)
!    deallocate(B)
!    deallocate(X)
!    deallocate(Ipiv)

end program prog

构建过程中的当前错误是

错误 S0188:参数编号 # 到 cusolverspcreate/etc:类型不匹配

我不知道如何解决它。该程序是对工作cusolverDn的修改,我敢肯定这意味着我犯了很多错误,因为我可以参考的接口样本并不多。

您的主程序中没有implicit none,并且未声明cusolver_Hndl,因此假定它是real

使用 implicit none 并声明所有变量。 cusolver_Hndl应该是type(ptr)的,不要忘记设置其值(如果它不是输出参数,则接口不会显示任何意图(。

相关内容

  • 没有找到相关文章

最新更新