I have a working spider that reads URLs from a text file, scrapes them all, and then stops.
My implementation:
class CoreSpider(scrapy.Spider):
    name = "final"
    custom_settings = {
        'ROBOTSTXT_OBEY': 'False',
        'HTTPCACHE_ENABLED': 'True',
        'LOG_ENABLED': 'False',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'random_useragent.RandomUserAgentMiddleware': 320
        },
    }

    def __init__(self):
        self.all_ngrams = get_ngrams()
        # logging.DEBUG(self.all_ngrams)
        self.search_term = ""
        self.start_urls = self.read_url()
        self.rules = (
            Rule(LinkExtractor(unique=True), callback='parse', follow=True, process_request='process_request'),
        )
    .....
    .....
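(read_url() is not shown; per the description it just loads the seed URLs from a text file. A minimal sketch of what such a helper might look like, where the file name seed_urls.txt is an assumption rather than something from the post:)
def read_url(self):
    # Hypothetical sketch of the missing helper: read one seed URL per
    # line from a text file, skipping blank lines.
    with open('seed_urls.txt') as f:
        return [line.strip() for line in f if line.strip()]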
I run this spider from a script, like this:
process = CrawlerProcess(get_project_settings())
process.crawl(CoreSpider)
process.start()
It raises the error twisted.internet.error.ReactorNotRestartable once it has finished scraping all the URLs.
I then tried using a CrawlerRunner, as shown below, and it raised the same error as before:
runner = CrawlerRunner(get_project_settings())
d = runner.crawl(CoreSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run() # the script will block here until the crawling is finished
Then I tried running the spider like this:
runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl():
    yield runner.crawl(CoreSpider)
    reactor.stop()

crawl()
reactor.run()
But it still raised the same error. (A Twisted reactor can only be started once per process; once it has stopped, reactor.run() cannot be called again, and ReactorNotRestartable is the error that signals this.)
How do I stop the spider manually once all the URLs have been scraped?
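(Aside: if the intent is to stop a spider manually from inside a callback once some condition holds, Scrapy's CloseSpider exception is the usual mechanism. A minimal sketch; the stopping condition below is only an illustration, not from the original code:)
from scrapy.exceptions import CloseSpider

def parse(self, response):
    # ... extract items as usual ...
    # Illustrative condition: stop once as many responses have been
    # received as there were seed URLs.
    if self.crawler.stats.get_value('response_received_count', 0) >= len(self.start_urls):
        raise CloseSpider('all seed URLs scraped')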
Update: Python 2.7 stack trace:
Traceback (most recent call last):
  File "seed_list_generator.py", line 768, in <module>
    process = CrawlerProcess(get_project_settings())
  File "/root/anaconda2/lib/python2.7/site-packages/scrapy/crawler.py", line 243, in __init__
    super(CrawlerProcess, self).__init__(settings)
  File "/root/anaconda2/lib/python2.7/site-packages/scrapy/crawler.py", line 134, in __init__
    self.spider_loader = _get_spider_loader(settings)
  File "/root/anaconda2/lib/python2.7/site-packages/scrapy/crawler.py", line 330, in _get_spider_loader
    return loader_cls.from_settings(settings.frozencopy())
  File "/root/anaconda2/lib/python2.7/site-packages/scrapy/spiderloader.py", line 61, in from_settings
    return cls(settings)
  File "/root/anaconda2/lib/python2.7/site-packages/scrapy/spiderloader.py", line 25, in __init__
    self._load_all_spiders()
  File "/root/anaconda2/lib/python2.7/site-packages/scrapy/spiderloader.py", line 47, in _load_all_spiders
    for module in walk_modules(name):
  File "/root/anaconda2/lib/python2.7/site-packages/scrapy/utils/misc.py", line 71, in walk_modules
    submod = import_module(fullpath)
  File "/root/anaconda2/lib/python2.7/importlib/__init__.py", line 37, in import_module
    __import__(name)
  File "/root/Public/company_profiler/profiler/spiders/run_spider.py", line 12, in <module>
    process.start()
  File "/root/anaconda2/lib/python2.7/site-packages/scrapy/crawler.py", line 285, in start
    reactor.run(installSignalHandlers=False) # blocking call
  File "/root/anaconda2/lib/python2.7/site-packages/twisted/internet/base.py", line 1242, in run
    self.startRunning(installSignalHandlers=installSignalHandlers)
  File "/root/anaconda2/lib/python2.7/site-packages/twisted/internet/base.py", line 1222, in startRunning
    ReactorBase.startRunning(self)
  File "/root/anaconda2/lib/python2.7/site-packages/twisted/internet/base.py", line 730, in startRunning
    raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
Python 3.6 traceback:
File "seed_list_generator.py", line 769, in <module>
process = CrawlerProcess(get_project_settings())
File "/root/anaconda3/lib/python3.6/site-packages/scrapy/crawler.py", line 249, in __init__
super(CrawlerProcess, self).__init__(settings)
File "/root/anaconda3/lib/python3.6/site-packages/scrapy/crawler.py", line 137, in __init__
self.spider_loader = _get_spider_loader(settings)
File "/root/anaconda3/lib/python3.6/site-packages/scrapy/crawler.py", line 336, in _get_spider_loader
return loader_cls.from_settings(settings.frozencopy())
File "/root/anaconda3/lib/python3.6/site-packages/scrapy/spiderloader.py", line 61, in from_settings
return cls(settings)
File "/root/anaconda3/lib/python3.6/site-packages/scrapy/spiderloader.py", line 25, in __init__
self._load_all_spiders()
File "/root/anaconda3/lib/python3.6/site-packages/scrapy/spiderloader.py", line 47, in _load_all_spiders
for module in walk_modules(name):
File "/root/anaconda3/lib/python3.6/site-packages/scrapy/utils/misc.py", line 71, in walk_modules
submod = import_module(fullpath)
File "/root/anaconda3/lib/python3.6/importlib/__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 978, in _gcd_import
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 655, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 678, in exec_module
File "<frozen importlib._bootstrap>", line 205, in _call_with_frames_removed
File "/root/Public/company_profiler/profiler/spiders/run_spider.py", line 12, in <module>
process.start()
File "/root/anaconda3/lib/python3.6/site-packages/scrapy/crawler.py", line 291, in start
reactor.run(installSignalHandlers=False) # blocking call
File "/root/anaconda3/lib/python3.6/site-packages/twisted/internet/base.py", line 1242, in run
self.startRunning(installSignalHandlers=installSignalHandlers)
File "/root/anaconda3/lib/python3.6/site-packages/twisted/internet/base.py", line 1222, in startRunning
ReactorBase.startRunning(self)
File "/root/anaconda3/lib/python3.6/site-packages/twisted/internet/base.py", line 730, in startRunning
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
Thanks in advance.
What happens when you change the code like this?
class CoreSpider(scrapy.Spider):
    name = "final"
    custom_settings = {
        'ROBOTSTXT_OBEY': 'False',
        'HTTPCACHE_ENABLED': 'True',
        'LOG_ENABLED': 'False',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'random_useragent.RandomUserAgentMiddleware': 320
        },
    }

    def __init__(self, *args, **kwargs):
        # Python 3
        super().__init__(*args, **kwargs)
        # Python 2
        # super(CoreSpider, self).__init__(*args, **kwargs)
        self.all_ngrams = get_ngrams()
        # logging.DEBUG(self.all_ngrams)
        self.search_term = ""
        self.start_urls = self.read_url()
        self.rules = (
            Rule(LinkExtractor(unique=True), callback='parse', follow=True, process_request='process_request'),
        )
    .....
    .....
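The point of the change is the added call to the base-class initializer: scrapy.Spider.__init__ is what applies keyword arguments onto the spider instance (for example, arguments forwarded by process.crawl(CoreSpider, some_arg='value'), where some_arg is only an illustration) and initializes attributes such as start_urls when they are not already set. Overriding __init__ without delegating to it skips that setup.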
Finally, I managed to solve this by putting the crawl code inside an if __name__ == "__main__" block:
if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(CoreSpider)
    process.start()
Now the crawler stops gracefully once it has finished scraping all the URLs.
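This matches what both tracebacks show: run_spider.py sits inside the profiler/spiders package, so when CrawlerProcess(get_project_settings()) builds its spider loader it imports every module in that package, which re-executes process.start() (line 12 of run_spider.py) while the reactor is already running and raises ReactorNotRestartable. The __main__ guard prevents the crawl from being triggered during that import; moving the runner script out of the spiders/ directory should have the same effect.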