How do I store all the scraped stats before the spider closes?



I want to store all the statistics the crawler collects in an output file in JSON format. However, I get this error:

'MemoryStatsCollector' object has no attribute 'get_all'

The documentation mentions stats.get_all as the way to get everything that has been stored. What is the correct way to implement this?

import scrapy
from scrapy import signals
from scrapy import crawler
import jsonlines

class TestSpider(scrapy.Spider):
    name = 'stats'
    start_urls = ['http://quotes.toscrape.com']

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # spider = super(TestSpider, cls).from_crawler(crawler, *args, **kwargs)
        stat = cls(crawler.stats)
        crawler.signals.connect(stat.spider_closed, signals.spider_closed)
        return stat

    def spider_closed(self):
        # self.stats = stat
        txt_file = 'some_text.jl'
        with jsonlines.open(txt_file, 'w') as f:
            f.write(self.stats.get_all())

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get()
            }

It turns out there is no get_all method; I had to call get_stats() instead. The documentation lists these methods:

  • stats.get_value()
  • stats.get_stats()
  • stats.max_value()/stats.min_value()
  • stats.inc_value()
  • stats.set_value()

Further information is provided in the stats documentation; a minimal usage sketch follows.
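For illustration only, here is a hedged sketch of what calling those collector methods from a spider callback can look like. The spider name and the custom/... stat keys below are made up for the example; only the StatsCollector methods themselves come from the Scrapy docs:

import scrapy

class StatsDemoSpider(scrapy.Spider):
    # Hypothetical spider, only to illustrate the StatsCollector methods above.
    name = 'stats_demo'
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        stats = self.crawler.stats  # the running StatsCollector

        # inc_value: increment a counter (created if missing).
        stats.inc_value('custom/pages_seen')

        # set_value / get_value: store and read back a single value.
        stats.set_value('custom/last_url', response.url)
        self.logger.info('last_url=%s', stats.get_value('custom/last_url'))

        # max_value / min_value: keep only the largest/smallest value seen.
        stats.max_value('custom/max_status', response.status)
        stats.min_value('custom/min_status', response.status)

        # get_stats: the whole stats dict collected so far.
        self.logger.info('stats so far: %s', stats.get_stats())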

The working part:

def spider_closed(self):
    # self.stats = stat
    txt_file = 'some_text.jl'
    with jsonlines.open(txt_file, 'w') as f:
        # f.write(f'{self.stats.get_all()}') --- Changed
        f.write(f'{self.stats.get_stats()}')
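One caveat: f'{self.stats.get_stats()}' writes the dict's Python repr (single quotes, datetime.datetime(...) entries), which json.loads cannot read back. If genuinely machine-readable JSON is wanted, a sketch using the standard json module, with default=str to stringify the datetime values (the filename here is arbitrary):

import json

def spider_closed(self):
    # get_stats() returns a plain dict, but start_time/finish_time are
    # datetime objects, which json refuses to serialize by default.
    # default=str converts them to their string form.
    with open('some_text.json', 'w') as f:
        json.dump(self.stats.get_stats(), f, default=str, indent=2)

Written this way the file can be loaded back with json.load, at the cost of the timestamps becoming plain strings.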

Output:

{
    "log_count/INFO": 10,
    "log_count/DEBUG": 3,
    "start_time": datetime.datetime(2022, 7, 6, 16, 16, 30, 553373),
    "memusage/startup": 59895808,
    "memusage/max": 59895808,
    "scheduler/enqueued/memory": 1,
    "scheduler/enqueued": 1,
    "scheduler/dequeued/memory": 1,
    "scheduler/dequeued": 1,
    "downloader/request_count": 1,
    "downloader/request_method_count/GET": 1,
    "downloader/request_bytes": 223,
    "downloader/response_count": 1,
    "downloader/response_status_count/200": 1,
    "downloader/response_bytes": 2086,
    "httpcompression/response_bytes": 11053,
    "httpcompression/response_count": 1,
    "response_received_count": 1,
    "item_scraped_count": 1,
    "elapsed_time_seconds": 0.34008,
    "finish_time": datetime.datetime(2022, 7, 6, 16, 16, 30, 893453),
    "finish_reason": "finished",
}
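As an aside, the commented-out super() call in the question points at the pattern the Scrapy signals documentation recommends: let the base from_crawler build and bind the spider (so spider.crawler and spider.settings stay wired up), then attach the stats handle and the signal afterwards. A sketch of that variant:

import scrapy
from scrapy import signals

class TestSpider(scrapy.Spider):
    name = 'stats'
    start_urls = ['http://quotes.toscrape.com']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Let Scrapy construct and bind the spider, then attach the
        # stats collector and the spider_closed handler.
        spider = super().from_crawler(crawler, *args, **kwargs)
        spider.stats = crawler.stats
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self):
        self.logger.info('final stats: %s', self.stats.get_stats())

    def parse(self, response):
        for href in response.xpath('//div[@class="row"]//a/@href').getall():
            yield {'some_items_links': href}

This also removes the need for a custom __init__, since the spider is created with the default arguments.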
