PyOpenCL 简单矩阵乘法



我正在尝试学习 PyOpenCL。我参考了网上找到的各种教程和示例,试着拼出一个简单的矩阵乘法程序,但不明白为什么得不到正确的结果:在我看来,要么是内核中的循环根本没有被执行(输出 C_flat 始终为零),要么是我以错误的方式管理了内存。谁能给我一些建议吗?非常感谢!

代码如下:

import numpy as np
import pyopencl as cl
import time
def create_input_memory(context, input_arrays):
    """Pair every input array with a read-only device buffer of the same size.

    Args:
        context: OpenCL context in which the buffers are created.
        input_arrays: Iterable of arrays exposing an ``nbytes`` attribute.

    Returns:
        A list of ``(array, buffer)`` tuples, in the order of ``input_arrays``.
    """
    pairs = []
    for host_array in input_arrays:
        # Only the byte size is given to the buffer; no host data is passed.
        device_buffer = cl.Buffer(
            context,
            flags=cl.mem_flags.READ_ONLY,
            size=host_array.nbytes,
        )
        pairs.append((host_array, device_buffer))
    return pairs
def create_output_memory(context, output_arrays):
    """Pair every output array with a write-only device buffer of the same size.

    Args:
        context: OpenCL context in which the buffers are created.
        output_arrays: Iterable of arrays exposing an ``nbytes`` attribute.

    Returns:
        A list of ``(array, buffer)`` tuples, in the order of ``output_arrays``.
    """
    pairs = []
    for host_array in output_arrays:
        # The buffer is sized from the host array; results are copied back later
        # by the caller.
        device_buffer = cl.Buffer(
            context,
            flags=cl.mem_flags.WRITE_ONLY,
            size=host_array.nbytes,
        )
        pairs.append((host_array, device_buffer))
    return pairs
def matrix_multiply_gpu(A, B):
    """Attempt to multiply ``A`` by ``B`` with an OpenCL kernel.

    This is the version posted in the question; it is kept verbatim because
    its behaviour is what is being diagnosed.

    Args:
        A: 2-D array; ``A.shape[0]`` and ``A.shape[1]`` are read as its
            height and width.
        B: 2-D array, read the same way.

    Returns:
        ``C_flat``, the flattened host array that the output buffer is copied
        back into.
    """
    A_height, A_width = A.shape[0], A.shape[1]
    B_height, B_width = B.shape[0], B.shape[1]
    # No dtype is given, so NumPy's default (float64) is used here, while the
    # kernel below declares its pointers as ``float``.
    C = np.zeros((A_height, B_width))
    A_flat = A.flatten()
    B_flat = B.flatten()
    C_flat = C.flatten()
    print(C_flat)
    # The kernel declares its integer parameters as (Wa, Ha, Wb, Hb) and indexes
    # with get_global_id(0) and get_global_id(1).
    kernel_source = """
kernel void mul(int Wa, int Ha, int Wb, int Hb,
global float *input_a,
global float *input_b,
global float *result){
/* ROW MAJOR notation (I imagine the "GPU matrix") --> no, just model*/
int row = get_global_id(0);
int col = get_global_id(1);
float sum = 0.0f;
for (int i = 0; i < Wa; i++){
sum += input_a[row * Wa + i] * input_b[i * Wb + col];
}
result[row * Wb + col] = sum;
}
"""
    # Build a GPU context on the first reported platform and compile the kernel.
    platforms = cl.get_platforms()
    context = cl.Context(dev_type=cl.device_type.GPU,
                         properties=[(cl.context_properties.PLATFORM, platforms[0])])
    gpu_program_source = cl.Program(context, kernel_source)
    gpu_program = gpu_program_source.build()
    # The helpers create buffers from the arrays' byte sizes only; nothing in
    # this function copies A_flat or B_flat into those buffers.
    input_tuples = create_input_memory(context,
                                       (A_flat, B_flat))
    output_tuples = create_output_memory(context, (C_flat,))
    gpu_queue = cl.CommandQueue(context)

    # Buffer arguments are passed in the order: input_a, input_b, result.
    kernel_arguments = [buffer for (_,buffer) in input_tuples]
    kernel_arguments += [buffer for (_,buffer) in output_tuples]
    # NOTE(review): the launch uses a one-dimensional global size (1024,) with
    # local size (32,), although the kernel also reads get_global_id(1).
    # NOTE(review): the integers are passed as (height, width, height, width),
    # whereas the kernel signature names them (Wa, Ha, Wb, Hb).
    gpu_program.mul(gpu_queue, (1024,), (32,),
                    np.int32(A_height), np.int32(A_width), np.int32(B_height),
                    np.int32(B_width), *kernel_arguments)
    # Copy each output buffer back into its paired host array (C_flat).
    for (array, buffer) in output_tuples:
        cl.enqueue_copy(gpu_queue, src=buffer, dest=array)

    #wait for everyone to finish
    gpu_queue.finish()
    return C_flat
if __name__ == '__main__':
    # Demo run: multiply two 100x100 matrices of ones and print the result.
    A, B = np.ones((100, 100)), np.ones((100, 100))
    C = matrix_multiply_gpu(A, B)
    # Surround the output with blank lines; the newline escapes need their
    # backslashes, otherwise a literal "n" is printed.
    print("\n", C, "\n")

以下是您的代码中的一些问题:

  1. 你传入参数的顺序是(高度,宽度),但内核是按(宽度,高度)的顺序读取的。
  2. 你传入的是展平后的一维数组,但内核却按二维数组的方式来索引。
  3. 你没有把数据复制到 GPU:创建缓冲区时没有使用 COPY_HOST_PTR 标志。
  4. 代码还不能正常工作时,用辅助函数把它自动化只会让逻辑更难看清。等代码能正常工作之后再做这件事。我不会使用你的 create_... 函数。
  5. 我觉得直接把数组复制到设备比手动创建缓冲区更简单,所以下面采用了这种做法;不过两种方式的效果应该是一样的。
  6. 始终明确指定所传数据的 dtype,并确保它与内核期望的类型一致。我就曾因为这个问题花了无数个小时调试。

代码如下:

import numpy as np
import pyopencl as cl
import pyopencl.array
def matrix_multiply_gpu(A, B):
    """Compute the matrix product of ``A`` and ``B`` on the GPU with PyOpenCL.

    Each work-item computes one element of the result: the flat global id is
    split into a row of ``A`` and a column of ``B``.

    Args:
        A: 2-D array-like of shape ``(A_height, A_width)``.
        B: 2-D array-like of shape ``(B_height, B_width)``; ``B_height`` must
            equal ``A_width``.

    Returns:
        A float32 NumPy array of shape ``(A_height, B_width)``.

    Raises:
        ValueError: If an input is not 2-D or the inner dimensions differ.
    """
    # The kernel reads ``float`` (32-bit) values in row-major order, so coerce
    # the inputs here instead of trusting the caller's dtype and layout.
    A = np.ascontiguousarray(A, dtype=np.float32)
    B = np.ascontiguousarray(B, dtype=np.float32)
    if A.ndim != 2 or B.ndim != 2:
        raise ValueError(
            "A and B must be 2-D, got shapes %r and %r" % (A.shape, B.shape)
        )
    A_height, A_width = A.shape
    B_height, B_width = B.shape
    if A_width != B_height:
        raise ValueError(
            "shape mismatch: A is %r but B is %r" % (A.shape, B.shape)
        )

    size = A_height * B_width
    if size == 0 or A_width == 0:
        # Nothing to launch: the result is empty, or every element is an
        # empty sum (zero). This also avoids a zero-sized kernel launch.
        return np.zeros((A_height, B_width), dtype=np.float32)

    # ----------------------------- context/queue setup, as in the question
    platforms = cl.get_platforms()
    context = cl.Context(
        dev_type=cl.device_type.GPU,
        properties=[(cl.context_properties.PLATFORM, platforms[0])],
    )
    gpu_queue = cl.CommandQueue(context)

    # ------------------------------------------- kernel: one id per element
    # The integer parameters are declared in the same (height, width) order
    # in which they are passed below.
    kernel_source = """
    kernel void mul(int Ha, int Wa, int Hb, int Wb,
                    global const float *input_a,
                    global const float *input_b,
                    global       float *result)
    {
        int gid = get_global_id(0);
        int Arow = gid / Wb;
        int Bcol = gid % Wb;
        float sum = 0.0f;
        for (int i = 0; i < Wa; i++)
            sum += input_a[Arow*Wa+i] * input_b[i*Wb+Bcol];
        result[gid] = sum;
    }
    """

    # Inputs are copied to the device; the output is only allocated there,
    # since the kernel overwrites every element.
    Ad = cl.array.to_device(gpu_queue, A.ravel())
    Bd = cl.array.to_device(gpu_queue, B.ravel())
    Cd = cl.array.empty(gpu_queue, size, dtype=np.float32)

    gpu_program = cl.Program(context, kernel_source).build()
    # Global size is the number of output elements; the local size is left
    # to the implementation (None).
    gpu_program.mul(
        gpu_queue, (size,), None,
        np.int32(A_height), np.int32(A_width),
        np.int32(B_height), np.int32(B_width),
        Ad.data, Bd.data, Cd.data,
    )

    # Drop the reshape to return the flattened result instead.
    return Cd.get().reshape((A_height, B_width))
if __name__ == '__main__':
    from pprint import pprint

    # Small demo: a 5x3 matrix filled with 2.0 times a 3x4 matrix filled with 3.0.
    A = np.full((5, 3), 2.0, dtype=np.float32)
    B = np.full((3, 4), 3.0, dtype=np.float32)

    C = matrix_multiply_gpu(A, B)
    pprint(C)

最新更新