I'm working through the Packt book Python Web Scraping - Second Edition, using the reference code from https://github.com/kjam/wswp/tree/master/code/chp4, and I keep running into the error in my post title.
The code does run on macOS; from what I've gathered, the problem is likely the fork vs. spawn difference between Windows and Linux. I'm new to multiprocessing and can't work out what I need to change, or where, to get this running on Windows. Any help is much appreciated.
I have already tried multiprocessing.set_start_method("spawn"), based on some digging on SO, but to no avail.
import multiprocessing
import time


def threaded_crawler_rq(start_url, link_regex, user_agent='wswp', proxies=None,
                        delay=3, max_depth=4, num_retries=2, cache={},
                        max_threads=10, scraper_callback=None):
    """
    Comments
    """


def mp_threaded_crawler(*args, **kwargs):
    """ create a multiprocessing threaded crawler """
    processes = []
    num_procs = kwargs.pop('num_procs')
    if not num_procs:
        num_procs = multiprocessing.cpu_count()
    for _ in range(num_procs):
        proc = multiprocessing.Process(target=threaded_crawler_rq,
                                       args=args, kwargs=kwargs)
        proc.start()
        processes.append(proc)
    # wait for processes to complete
    for proc in processes:
        proc.join()


if __name__ == '__main__':
    from chp4.alexa_callback import AlexaCallback
    from chp3.rediscache import RedisCache
    import argparse

    parser = argparse.ArgumentParser(description='Multiprocessing threaded link crawler')
    parser.add_argument('max_threads', type=int, help='maximum number of threads',
                        nargs='?', default=5)
    parser.add_argument('num_procs', type=int, help='number of processes',
                        nargs='?', default=None)
    parser.add_argument('url_pattern', type=str, help='regex pattern for url matching',
                        nargs='?', default='$^')
    par_args = parser.parse_args()

    AC = AlexaCallback()
    AC()

    start_time = time.time()
    mp_threaded_crawler(AC.urls, par_args.url_pattern, cache=RedisCache(),
                        num_procs=par_args.num_procs, max_threads=par_args.max_threads)
    print('Total time: %ss' % (time.time() - start_time))
The error:
Traceback (most recent call last):
  File "threaded_crawler_with_queue.py", line 49, in <module>
    num_procs=par_args.num_procs, max_threads=par_args.max_threads)
  File "threaded_crawler_with_queue.py", line 22, in mp_threaded_crawler
    proc.start()
  File "C:\Users\iR9\Anaconda3\lib\multiprocessing\process.py", line 105, in start
    self._popen = self._Popen(self)
  File "C:\Users\iR9\Anaconda3\lib\multiprocessing\context.py", line 223, in _Popen
    return _default_context.get_context().Process._Popen(process_obj)
  File "C:\Users\iR9\Anaconda3\lib\multiprocessing\context.py", line 322, in _Popen
    return Popen(process_obj)
  File "C:\Users\iR9\Anaconda3\lib\multiprocessing\popen_spawn_win32.py", line 65, in __init__
    reduction.dump(process_obj, to_child)
  File "C:\Users\iR9\Anaconda3\lib\multiprocessing\reduction.py", line 60, in dump
    ForkingPickler(file, protocol).dump(obj)
TypeError: can't pickle _thread.lock objects

C:\Users\iR9\Desktop\wswp\code>Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "C:\Users\iR9\Anaconda3\lib\multiprocessing\spawn.py", line 99, in spawn_main
    new_handle = reduction.steal_handle(parent_pid, pipe_handle)
  File "C:\Users\iR9\Anaconda3\lib\multiprocessing\reduction.py", line 82, in steal_handle
    _winapi.PROCESS_DUP_HANDLE, False, source_pid)
OSError: [WinError 87] The parameter is incorrect
On Windows, at least, the arguments you give to the function that will run in the worker process are passed to that process over an IPC channel behind the scenes.
You therefore cannot pass unpicklable objects in args and kwargs to multiprocessing.Process().
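You can check any object for this yourself by trying to pickle it. Here is a minimal sketch; FakeCache is a made-up stand-in for something like the RedisCache connection object, which keeps a lock internally, and the exact error wording varies by Python version:

import pickle
import threading


class FakeCache:
    """Hypothetical stand-in for RedisCache: holds a lock, like a real connection object."""
    def __init__(self):
        self._lock = threading.Lock()  # locks are process-local and cannot be pickled


try:
    pickle.dumps(FakeCache())
except TypeError as err:
    print(err)  # e.g. "can't pickle _thread.lock objects"

The debugging session below does exactly that check against the actual args and kwargs your script builds: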
(venv) C:\Users\Sasha\Documents>pip install ipdb
<...>
(venv) C:\Users\Sasha\Documents>ipdb3 t.py
> c:\users\sasha\documents\t.py(1)<module>()
----> 1 import multiprocessing
2 import time
3
ipdb> b 23
Breakpoint 1 at c:\users\sasha\documents\t.py:23
ipdb> c
> c:\users\sasha\documents\t.py(23)mp_threaded_crawler()
22 args=args, kwargs=kwargs)
1--> 23 proc.start()
24 processes.append(proc)
ipdb> p args
(['http://google.com', 'http://youtube.com', 'http://facebook.com', 'http://baidu.com', <...>
ipdb> p kwargs
{'cache': <chp3.rediscache.RedisCache object at 0x000000000560F780>, 'max_threads': 5}
ipdb> import pickle
ipdb> pickle.dumps(kwargs)
*** TypeError: can't pickle _thread.lock objects
Objects aren't unpicklable for no reason. Usually it means that whatever they stand for (such as a synchronization primitive or a network connection) is inherently local to one process, so trying to pass it to another process makes no sense.
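Given that, the usual way out is not to ship the RedisCache across the process boundary at all, but to construct it inside each worker. A rough sketch of how mp_threaded_crawler could be rearranged to do that (crawler_proc is a helper name I'm introducing here, and it assumes RedisCache() can be built with no arguments in every process, the same way your __main__ block already builds it):

def crawler_proc(args, kwargs):
    """Runs inside the child process: build process-local resources here."""
    from chp3.rediscache import RedisCache
    kwargs['cache'] = RedisCache()  # created in the worker, never pickled
    threaded_crawler_rq(*args, **kwargs)


def mp_threaded_crawler(*args, **kwargs):
    """ create a multiprocessing threaded crawler """
    kwargs.pop('cache', None)  # drop the unpicklable object before spawning
    num_procs = kwargs.pop('num_procs') or multiprocessing.cpu_count()
    processes = []
    for _ in range(num_procs):
        # only plain lists, strings and ints cross the process boundary now
        proc = multiprocessing.Process(target=crawler_proc, args=(args, kwargs))
        proc.start()
        processes.append(proc)
    # wait for processes to complete
    for proc in processes:
        proc.join()

Each worker then opens its own connection to the same Redis server, which is what you want anyway, since the connection itself can never be shared between processes by pickling.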