Speeding up np.sum with multiprocessing

If I have a numpy array with 10^8 to 10^9 elements, is it possible to compute its sum faster than np.sum does?

I tried using multiprocessing with the fork start method, but it seems to be slower than just calling np.sum, regardless of the number of workers (1-4). I'm using Python 3.8 on a Mac with a 2 GHz dual-core Intel Core i5. I'm not sure whether the results would be different with more CPUs.

My code:

import concurrent.futures
import multiprocessing as mp
import time
from concurrent.futures.process import ProcessPoolExecutor

import numpy as np

# based on: https://luis-sena.medium.com/sharing-big-numpy-arrays-across-python-processes-abf0dc2a0ab2


def np_sum_global(start, stop):
    # Each worker sums its own slice of the fork-inherited global array.
    return np.sum(data[start:stop])


def benchmark():
    st = time.time()
    ARRAY_SIZE = int(3e8)
    print("array size =", ARRAY_SIZE)
    global data
    data = np.random.random(ARRAY_SIZE)
    print("generated", time.time() - st)
    print("CPU Count =", mp.cpu_count())
    for trial in range(5):
        print("TRIAL =", trial)
        # Method 1: plain np.sum in the main process.
        st = time.time()
        s = np.sum(data)
        print("method 1", time.time() - st, s)
        # Method 2: split the array into NUM_WORKERS slices and sum in parallel.
        for NUM_WORKERS in range(1, 5):
            st = time.time()
            futures = []
            with ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
                for i in range(0, NUM_WORKERS):
                    futures.append(
                        executor.submit(
                            np_sum_global,
                            ARRAY_SIZE * i // NUM_WORKERS,
                            ARRAY_SIZE * (i + 1) // NUM_WORKERS,
                        )
                    )
            futures, _ = concurrent.futures.wait(futures)
            s = sum(future.result() for future in futures)
            print("workers =", NUM_WORKERS, time.time() - st, s)
        print()


if __name__ == "__main__":
    mp.set_start_method("fork")
    benchmark()

Output:

array size = 300000000
generated 5.1455769538879395
CPU Count = 4
TRIAL = 0
method 1 0.29593801498413086 150004049.39847052
workers = 1 1.8904719352722168 150004049.39847052
workers = 2 1.2082111835479736 150004049.39847034
workers = 3 1.2650330066680908 150004049.39847082
workers = 4 1.233708143234253 150004049.39847046
TRIAL = 1
method 1 0.5861320495605469 150004049.39847052
workers = 1 1.801928997039795 150004049.39847052
workers = 2 1.165492057800293 150004049.39847034
workers = 3 1.2669389247894287 150004049.39847082
workers = 4 1.2941789627075195 150004049.39847043
TRIAL = 2
method 1 0.44912219047546387 150004049.39847052
workers = 1 1.8038971424102783 150004049.39847052
workers = 2 1.1491520404815674 150004049.39847034
workers = 3 1.3324410915374756 150004049.39847082
workers = 4 1.4198641777038574 150004049.39847046
TRIAL = 3
method 1 0.5163640975952148 150004049.39847052
workers = 1 3.248213052749634 150004049.39847052
workers = 2 2.5148861408233643 150004049.39847034
workers = 3 1.0224149227142334 150004049.39847082
workers = 4 1.20924711227417 150004049.39847046
TRIAL = 4
method 1 1.2363107204437256 150004049.39847052
workers = 1 1.8627309799194336 150004049.39847052
workers = 2 1.233341932296753 150004049.39847034
workers = 3 1.3235111236572266 150004049.39847082
workers = 4 1.344843864440918 150004049.39847046
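Note that even with a single worker the multiprocessing path takes about 1.8 s versus roughly 0.3-0.6 s for np.sum, which points at fixed overhead: the timed region includes forking the pool and tearing it down, not just the summing. Below is a minimal sketch (not part of the original benchmark) that reuses one executor so startup is paid outside the timed region; it assumes the data, ARRAY_SIZE, and np_sum_global globals from the script above.

from concurrent.futures import ProcessPoolExecutor, wait

NUM_WORKERS = 4
with ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
    # Warm-up submit so the workers are fully forked before timing starts.
    wait([executor.submit(np_sum_global, 0, 1)])
    st = time.time()
    futures = [
        executor.submit(
            np_sum_global,
            ARRAY_SIZE * i // NUM_WORKERS,
            ARRAY_SIZE * (i + 1) // NUM_WORKERS,
        )
        for i in range(NUM_WORKERS)
    ]
    s = sum(f.result() for f in futures)
    print("workers =", NUM_WORKERS, time.time() - st, s)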

Some links I've looked at:

  • How to parallelize a sum calculation in python numpy?

  • Combine Pool.map with shared memory Array in Python multiprocessing (a minimal sketch of that shared-memory idea follows this list)
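The second link avoids pickling array slices by letting every process read the same memory. As a hedged sketch of that idea using Python 3.8's multiprocessing.shared_memory (the function name shm_sum and the sizes are mine, and this is illustrative rather than a drop-in replacement for the benchmark above):

import numpy as np
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import shared_memory

def shm_sum(name, shape, dtype, start, stop):
    # Attach to the existing shared block and view it as an ndarray (no copy).
    shm = shared_memory.SharedMemory(name=name)
    view = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
    total = float(np.sum(view[start:stop]))
    del view  # release the buffer reference before closing
    shm.close()
    return total

if __name__ == "__main__":
    data = np.random.random(10_000_000)
    shm = shared_memory.SharedMemory(create=True, size=data.nbytes)
    buf = np.ndarray(data.shape, dtype=data.dtype, buffer=shm.buf)
    buf[:] = data  # one copy in; workers then read in place
    n, workers = data.shape[0], 4
    with ProcessPoolExecutor(max_workers=workers) as ex:
        futures = [
            ex.submit(shm_sum, shm.name, data.shape, data.dtype,
                      n * i // workers, n * (i + 1) // workers)
            for i in range(workers)
        ]
        print(sum(f.result() for f in futures))
    del buf
    shm.close()
    shm.unlink()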

Below is a benchmark using numba. On the first run it has to compile the code, which makes that run much slower. Subsequent runs are roughly two to three times faster than plain numpy. So whether numba is worth it for you depends on how often you run the code.

import numba
import numpy as np
import time

# based on: https://luis-sena.medium.com/sharing-big-numpy-arrays-across-python-processes-abf0dc2a0ab2


@numba.jit(nopython=True, parallel=True, cache=True)
def numba_sum(data):
    return np.sum(data)


def benchmark():
    st = time.time()
    ARRAY_SIZE = int(3e8)
    print("array size =", ARRAY_SIZE)
    global data
    data = np.random.random(ARRAY_SIZE)
    print("generated", time.time() - st)
    for trial in range(5):
        print("TRIAL =", trial)
        st = time.time()
        s = np.sum(data)
        print("method 1", time.time() - st, s)
        print("TRIAL =", trial)
        st = time.time()
        s = numba_sum(data)
        print("method 2", time.time() - st, s)


if __name__ == "__main__":
    benchmark()
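If np.sum doesn't parallelize under your numba version, an explicit prange reduction should be equivalent. This is a sketch that plugs into the script above (numba_prange_sum is my name for this hypothetical variant):

@numba.jit(nopython=True, parallel=True, cache=True)
def numba_prange_sum(data):
    # Explicit parallel reduction: numba recognizes s += ... across
    # prange iterations and combines per-thread partial sums.
    s = 0.0
    for i in numba.prange(data.shape[0]):
        s += data[i]
    return s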
