I am trying to implement this code to use my GPU:
import numpy as np
import math
import numba
from numba import vectorize, cuda

@cuda.jit(['float32(float32)'], device=True)
def CholeskyInc(A):
    n, m = A.shape
    if n != m:
        print('La matriz no es cuadrada')
    else:
        L = np.zeros((n, n))
        for k in range(n):
            L[k, k] = math.sqrt(A[k, k])
            for i in range(k + 1, n):
                if A[i, k] != 0:
                    L[i, k] = A[i, k] / A[k, k]
            for j in range(k + 1, n):
                for i in range(j, n):
                    if A[i, j] != 0:
                        L[i, j] = A[i, j] - A[i, k] * A[j, k]
        return L
Actually, I don't have any experience with numba. I tried a few options, but none of them worked. Can anyone explain how I can run this code on the GPU rather than the CPU, using numba and cuda?
I don't have the rep needed to just leave a comment, so here are a few suggestions to get you started:
- Decorator usage:
  - Using @cuda.jit as a decorator lets you skip defining your function's argument and return types.
  - You only need @cuda.jit(device=True) if you plan to call this function from another CUDA kernel function; a CUDA kernel function is one where device=True is not set. See the code sample at the end of this response.
  - Functions compiled into CUDA kernels cannot have explicit returns, and they cannot print to the console.
- In place of returning L, create L in the host code (the code you run on the CPU) and pass it as the second argument to your function. After the function runs, the results will be stored in L.
- In place of the print statement, you will have to do something like assigning an error value to L and checking for that error value before using L later. Both of these patterns are shown in the short sketch below, and again in the full example after it.
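To make those points concrete, here is a minimal sketch of the kernel/device-function split and the output-argument pattern. It is only illustrative; the names double_it and double_kernel are made up for this sketch:

import numpy as np
from numba import cuda

@cuda.jit(device=True)  # device=True: callable only from other GPU code
def double_it(x):
    return x * 2.0      # unlike kernels, device functions may return values

@cuda.jit               # no device=True: this is a kernel, launched from the host
def double_kernel(arr_in, arr_out):
    i = cuda.grid(1)     # this thread's position in the 1-D grid
    if i < arr_in.size:  # boundary check for threads past the end of the array
        arr_out[i] = double_it(arr_in[i])  # results go into the output argument

a = np.arange(8, dtype=np.float32)
out = np.zeros_like(a)        # create the output array on the host...
double_kernel[1, 32](a, out)  # ...pass it in; numba copies it to the device and back

And here is the full example: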
import numpy as np
from numba import cuda
from math import sqrt


def main():
    """ Creates 2 matrices with randomized values which are then
    passed to a cuda kernel where the incomplete Cholesky factorization
    is done.

    These first several lines set up the input arrays (your A and L)
    as well as the thread and block shape that numba will need when
    launching your kernel onto the GPU.
    """
    np.set_printoptions(linewidth=240)
    rand_n = np.random.randint(8, 11)
    A = np.random.random((rand_n, rand_n)).astype(np.float32)
    L = np.zeros((A.shape[0], A.shape[0]), dtype=A.dtype)

    # tpb meaning "threads per block"
    tpb_y = 16
    tpb_x = 16
    tpb = tpb_x, tpb_y

    # bpg meaning "blocks per grid"
    bpg_y = (A.shape[0] + tpb_y - 1) // tpb_y  # ensures enough blocks to cover whole matrix height
    bpg_x = (A.shape[1] + tpb_x - 1) // tpb_x  # ensures enough blocks to cover whole matrix width
    # Alternatively, you could use the following commented lines,
    # which will allocate fewer thread-blocks (less overhead) but require that
    # we implement striding loops in our kernel/device functions to
    # cover the rest of the matrix data.
    # bpg_y = (A.shape[0]//2 + tpb_y-1)//tpb_y  # cover half of matrix height, and stride the other half
    # bpg_x = (A.shape[1]//2 + tpb_x-1)//tpb_x  # cover half of matrix width, and stride the other half
    bpg = bpg_x, bpg_y

    # now we launch the kernel
    my_cuda_kernel[bpg, tpb](A, L)

    if np.isnan(L[0, 0]):
        print('La matriz no es cuadrada')

    print(f"A:\n{A}")
    print(f"L:\n{L}")


@cuda.jit
def my_cuda_kernel(arr_in: np.ndarray, arr_out: np.ndarray):
    # in your kernel, you can set up shared memory
    # or implement looping structures to stride through global device memory.
    n, m = arr_in.shape
    if n != m:
        # set the first element on the diagonal to a NaN value,
        # as a way to signal that no operations were done on L.
        arr_out[0, 0] = np.nan  # check by calling np.isnan(L[0,0])
        return

    # get the current thread's location within the grid
    x, y = cuda.grid(2)  # 2 because we launched the kernel specifying only x and y dimensions

    # these x,y coordinates don't explicitly map to coordinates in the array objects,
    # but we can simplify our indexing by treating them like they do.
    x_stride, y_stride = cuda.gridsize(2)

    # this loop serves 2 purposes:
    # 1. It acts as a boundary check to make sure we don't attempt to read
    #    memory locations outside of the input/output arrays.
    # 2. It also allows our threads to "stride" through the data, and do more
    #    work to justify the overhead cost of setting this thread-block up.
    #    * Note that threads will only be able to "stride"
    #      if we don't launch the kernel with full grid coverage.
    for _k in range(0, n, x_stride * y_stride):
        k = _k + y
        if k >= n:  # boundary check: threads past the matrix edge do nothing
            continue
        # we are telling each thread to stride the
        # diagonal and compute the cholesky for arr[:,k] rows and arr[k,:] cols.
        CholeskyInc(arr_in, arr_out, n, k)


@cuda.jit(device=True)
def CholeskyInc(A, L, n, k):
    L[k, k] = sqrt(A[k, k])
    for i in range(k + 1, n):
        if A[i, k] != 0:
            L[i, k] = A[i, k] / L[k, k]
    for j in range(k + 1, n):
        for i in range(j, n):
            if A[i, j] != 0:
                L[i, j] = A[i, j] - A[i, k] * A[j, k]


if __name__ == '__main__':
    main()
which produces the following output:
A:
[[0.29083535 0.80408204 0.63088804 0.90458757 0.86371994 0.7966909 0.5818828 0.8885034 ]
[0.24579939 0.8107 0.9785071 0.40308124 0.96477604 0.39282414 0.18642609 0.3129212 ]
[0.18401423 0.11662608 0.3512116 0.97926706 0.4021766 0.23531164 0.81310475 0.93359345]
[0.5243785 0.0469533 0.49699584 0.507422 0.24569689 0.4899143 0.61420184 0.9332651 ]
[0.1070556 0.5214806 0.24065676 0.8860097 0.5074029 0.43745205 0.09919663 0.9222924 ]
[0.17103161 0.25640044 0.94678307 0.26446953 0.9416109 0.8391528 0.69582105 0.5433431 ]
[0.5520146 0.10083573 0.7929039 0.44067022 0.6251738 0.6831893 0.23636419 0.97260725]
[0.47044474 0.13215688 0.5002679 0.72581047 0.8298903 0.55161124 0.6673608 0.5644971 ]]
L:
[[ 0.5392915 0. 0. 0. 0. 0. 0. 0. ]
[ 0.45578206 0.75028265 0. 0. 0. 0. 0. 0. ]
[ 0.34121478 0.07139549 0.31735036 0. 0. 0. 0. 0. ]
[ 0.972347 -0.08193862 0.40050274 0.23244919 0. 0. 0. 0. ]
[ 0.19851157 0.49516642 0.22095701 0.829872 0.495942 0. 0. 0. ]
[ 0.3171413 0.21436097 0.9153108 0.17478424 0.923301 0.809901 0. 0. ]
[ 1.0235922 -0.03484913 0.69132537 0.15120566 0.56607753 0.5887773 -0.06835592 0. ]
[ 0.8723385 0.01652185 0.4136994 0.47911936 0.7795266 0.47115034 0.4076684 0.34317887]]
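As a side note: passing NumPy arrays straight into the kernel launch, as above, makes numba copy them to the device and back on every call. If you end up calling the kernel repeatedly, you can manage the transfers yourself with numba's cuda.to_device and copy_to_host; a minimal sketch, reusing the A, L, bpg, and tpb from the example:

d_A = cuda.to_device(A)             # copy the input to the GPU once
d_L = cuda.to_device(L)             # output buffer stays on the GPU between launches
my_cuda_kernel[bpg, tpb](d_A, d_L)  # launch against the device arrays
d_L.copy_to_host(L)                 # copy the result back into L only when you need it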