Python code for matrix factorization with Numba CUDA



I am trying to implement this code so that it uses my GPU:

import numpy as np
import math
import numba
from numba import vectorize, cuda

@cuda.jit(['float32(float32)'], device=True)
def CholeskyInc(A):
    n, m = A.shape

    if n != m:
        print('La matriz no es cuadrada')
    else:
        L = np.zeros((n, n))

        for k in range(n):
            L[k, k] = math.sqrt(A[k, k])
            for i in range(k+1, n):
                if A[i, k] != 0:
                    L[i, k] = A[i, k]/A[k, k]

            for j in range(k+1, n):
                for i in range(j, n):
                    if A[i, j] != 0:
                        L[i, j] = A[i, j] - A[i, k]*A[j, k]
        return L

Actually, I don't have any experience with Numba. I tried a few options, but none of them worked. Can anyone explain how I can run this code on the GPU with Numba and CUDA, instead of on the CPU?

I don't have the reputation needed to just leave a comment, so here are a few suggestions to get you started:

  1. Decorator usage:
    • Using @cuda.jit as a decorator lets you skip declaring the argument types and return type of the function.
    • You only need @cuda.jit(device=True) if you plan to call the function from another CUDA kernel function. A CUDA kernel function is one decorated without device=True. See the code example at the end of this response.
  2. Functions compiled into a CUDA kernel cannot have an explicit return and cannot print to the console.
    • Instead of returning L, create L in host code (the code that runs on the CPU) and pass it to your function as a second argument. After the kernel runs, the result will be stored in L.
    • Instead of the print statement, you have to do something like assigning an error value to L and checking for that error value before using L later. A minimal sketch of both points follows this list; the full example is further down.
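Here is that minimal sketch (the names are made up for illustration only): a kernel squares an array element-wise by calling a device helper, and writes its result into an output argument instead of returning it.

import numpy as np
from numba import cuda

@cuda.jit(device=True)
def square(x):
    # device function: only callable from a kernel or another device function;
    # unlike a kernel, it may return a value to its caller on the GPU
    return x * x

@cuda.jit
def square_kernel(arr_in, arr_out):
    # kernel: no return value and no printing; results go into arr_out
    i = cuda.grid(1)
    if i < arr_in.shape[0]:  # boundary check
        arr_out[i] = square(arr_in[i])

a = np.arange(16, dtype=np.float32)
out = np.zeros_like(a)        # created on the host and passed in as the output argument
square_kernel[1, 32](a, out)  # 1 block of 32 threads is enough to cover all 16 elements
print(out)

The same pattern (host-allocated output array, kernel plus device helper) is what the fuller example below uses for the incomplete Cholesky factorization.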
import numpy as np
from numba import cuda
from math import sqrt


def main():
    """ Creates 2 matrices with randomized values which are then
    passed to a cuda kernel where the incomplete Cholesky factorization
    is done.

    These first several lines set up the input arrays (your A and L)
    as well as the thread and block shape that numba will need when
    launching your kernel onto the GPU.
    """
    np.set_printoptions(linewidth=240)
    rand_n = np.random.randint(8, 11)
    A = np.random.random((rand_n, rand_n)).astype(np.float32)
    L = np.zeros((A.shape[0], A.shape[0]), dtype=A.dtype)

    # tpb meaning "threads per block"
    tpb_y = 16
    tpb_x = 16
    tpb = tpb_x, tpb_y

    # bpg meaning "blocks per grid"
    bpg_y = (A.shape[0] + tpb_y - 1) // tpb_y  # ensures enough blocks to cover whole matrix height
    bpg_x = (A.shape[1] + tpb_x - 1) // tpb_x  # ensures enough blocks to cover whole matrix width
    # Alternatively, you could use the following commented lines,
    # which will allocate fewer thread-blocks (less overhead) but require that
    # we implement striding loops in our kernel/device functions to
    # cover the rest of the matrix data.
    # bpg_y = (A.shape[0]//2 + tpb_y-1)//tpb_y  # cover half of matrix height, and stride the other half
    # bpg_x = (A.shape[1]//2 + tpb_x-1)//tpb_x  # cover half of matrix width, and stride the other half
    bpg = bpg_x, bpg_y

    # now we launch the kernel
    my_cuda_kernel[bpg, tpb](A, L)

    if np.isnan(L[0, 0]):
        print('La matriz no es cuadrada')
    print(f"A:\n{A}")
    print(f"L:\n{L}")


@cuda.jit
def my_cuda_kernel(arr_in: np.ndarray, arr_out: np.ndarray):
    # in your kernel, you can set up shared memory
    # or implement looping structures to stride through global device memory.
    n, m = arr_in.shape
    if n != m:
        # set the first element on the diagonal to a NaN value,
        # as a way to signal that no operations were done on L.
        arr_out[0, 0] = np.nan  # check by calling np.isnan(L[0,0])
        return

    # get the current thread's location within the grid
    x, y = cuda.grid(2)  # 2 because we launched the kernel specifying only x and y dimensions
    # these x,y coordinates don't explicitly map to coordinates in the array objects,
    # but we can simplify our indexing by treating them like they do.
    x_stride, y_stride = cuda.gridsize(2)

    # this loop serves 2 purposes:
    #   1. It acts as a boundary check to make sure we don't attempt to read
    #      memory locations outside of the input/output arrays.
    #   2. It also allows our threads to "stride" through the data, and do more
    #      work to justify the overhead cost of setting this thread-block up.
    #           * Note that threads will only be able to "stride"
    #             if we don't launch the kernel with full grid coverage.
    for _k in range(0, n, x_stride * y_stride):
        k = _k + y
        if k < n:  # skip threads whose row index falls outside the matrix
            # we are telling each thread to stride the
            # diagonal and compute the cholesky for arr[:,k] rows and arr[k,:] cols.
            CholeskyInc(arr_in, arr_out, n, k)

@cuda.jit(device=True)
def CholeskyInc(A, L, n, k):
    L[k, k] = sqrt(A[k, k])
    for i in range(k + 1, n):
        if A[i, k] != 0:
            L[i, k] = A[i, k] / L[k, k]
    for j in range(k + 1, n):
        for i in range(j, n):
            if A[i, j] != 0:
                L[i, j] = A[i, j] - A[i, k] * A[j, k]


if __name__ == '__main__':
    main()

which produces the following output:

A:
[[0.29083535 0.80408204 0.63088804 0.90458757 0.86371994 0.7966909  0.5818828  0.8885034 ]
[0.24579939 0.8107     0.9785071  0.40308124 0.96477604 0.39282414 0.18642609 0.3129212 ]
[0.18401423 0.11662608 0.3512116  0.97926706 0.4021766  0.23531164 0.81310475 0.93359345]
[0.5243785  0.0469533  0.49699584 0.507422   0.24569689 0.4899143  0.61420184 0.9332651 ]
[0.1070556  0.5214806  0.24065676 0.8860097  0.5074029  0.43745205 0.09919663 0.9222924 ]
[0.17103161 0.25640044 0.94678307 0.26446953 0.9416109  0.8391528  0.69582105 0.5433431 ]
[0.5520146  0.10083573 0.7929039  0.44067022 0.6251738  0.6831893  0.23636419 0.97260725]
[0.47044474 0.13215688 0.5002679  0.72581047 0.8298903  0.55161124 0.6673608  0.5644971 ]]
L:
[[ 0.5392915   0.          0.          0.          0.          0.          0.          0.        ]
[ 0.45578206  0.75028265  0.          0.          0.          0.          0.          0.        ]
[ 0.34121478  0.07139549  0.31735036  0.          0.          0.          0.          0.        ]
[ 0.972347   -0.08193862  0.40050274  0.23244919  0.          0.          0.          0.        ]
[ 0.19851157  0.49516642  0.22095701  0.829872    0.495942    0.          0.          0.        ]
[ 0.3171413   0.21436097  0.9153108   0.17478424  0.923301    0.809901    0.          0.        ]
[ 1.0235922  -0.03484913  0.69132537  0.15120566  0.56607753  0.5887773  -0.06835592  0.        ]
[ 0.8723385   0.01652185  0.4136994   0.47911936  0.7795266   0.47115034  0.4076684   0.34317887]]
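One follow-up note on the launch itself: passing the NumPy arrays A and L straight into my_cuda_kernel makes Numba copy them to the device and back around every launch. If you end up calling the kernel repeatedly, you can manage the transfers yourself with Numba's device arrays; a rough sketch, assuming A, L, bpg and tpb are set up as in main() above:

d_A = cuda.to_device(A)             # copy the input matrix to the GPU once
d_L = cuda.to_device(L)             # copy the zero-initialised L so untouched entries stay 0
my_cuda_kernel[bpg, tpb](d_A, d_L)  # the kernel now operates on device arrays, no implicit copies
L = d_L.copy_to_host()              # bring the result back only when it is needed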
