自定义项管道中多个文件的CsvItemExporter未导出所有项



我创建了一个项目管道(item pipeline),作为对这个问题的回答。
它应该根据项目中设置的 page_no 值为每个页面创建一个新文件,这部分基本工作正常。
问题出在管道/项目导出器生成的最后一个 CSV 文件 page-10.csv 上:
最后 10 个值没有被导出,因此该文件一直是空的。造成这种行为的原因可能是什么?

pipelines.py

from scrapy.exporters import CsvItemExporter
class PerFilenameExportPipeline:
    """Distribute items across multiple CSV files according to their 'page_no' field.

    One ``CsvItemExporter`` is created lazily for each distinct ``page_no``
    value and cached in ``self.filename_to_exporter`` under the key
    ``'page-<page_no>'``.

    NOTE(review): only the block structure has been restored here; the logic
    is kept exactly as posted. The file objects opened in
    ``_exporter_for_item`` are never closed by this class, and the finalizer
    is named ``spider_closed`` -- the Scrapy item-pipeline documentation
    names the hook ``close_spider``, so confirm whether this method is ever
    invoked.
    """

    def open_spider(self, spider):
        """Start with an empty ``filename -> exporter`` registry."""
        self.filename_to_exporter = {}

    def spider_closed(self, spider):
        """Call ``finish_exporting()`` on every exporter created so far."""
        for exporter in self.filename_to_exporter.values():
            exporter.finish_exporting()

    def _exporter_for_item(self, item):
        """Return the exporter for ``item``'s page, creating it on first use.

        Side effect: the ``'page_no'`` key is removed from ``item`` so that it
        is not written out as a CSV column.
        """
        filename = 'page-' + str(item['page_no'])
        del item['page_no']
        if filename not in self.filename_to_exporter:
            # Binary mode: the exporter is handed the raw file object.
            f = open(f'{filename}.csv', 'wb')
            exporter = CsvItemExporter(f, export_empty_fields=True)
            exporter.start_exporting()
            self.filename_to_exporter[filename] = exporter
        return self.filename_to_exporter[filename]

    def process_item(self, item, spider):
        """Export ``item`` to the CSV file of its page and pass it on."""
        exporter = self._exporter_for_item(item)
        exporter.export_item(item)
        return item

爬虫(spider)

import scrapy
from ..pipelines import PerFilenameExportPipeline

class spidey(scrapy.Spider):
    """Crawl quotes.toscrape.com and tag every scraped quote with its page number.

    Each yielded item carries a ``'page_no'`` field; the spider enables
    ``PerFilenameExportPipeline`` through ``custom_settings``.
    """

    name = "idk"
    custom_settings = {
        'ITEM_PIPELINES': {
            PerFilenameExportPipeline: 100
        }
    }

    def start_requests(self):
        """Request the first page, labelling it as page 1 via ``cb_kwargs``."""
        yield scrapy.Request("http://quotes.toscrape.com/", cb_kwargs={'page_no': 1})

    def parse(self, response, page_no):
        """Yield one item per quote on the page, then follow the "next" link.

        ``page_no`` is the number of the page being parsed; the follow-up
        request is issued with ``page_no + 1``.
        """
        # Single-quoted outer strings so the XPath attribute tests can keep
        # their double quotes without terminating the Python string literal.
        for qts in response.xpath('//*[@class="quote"]'):
            yield {
                'page_no': page_no,
                'author': qts.xpath("./span[2]/small/text()").get(),
                'quote': qts.xpath('./*[@class="text"]/text()').get(),
            }
        next_pg = response.xpath('//li[@class="next"]/a/@href').get()
        if next_pg is not None:
            yield response.follow(next_pg, cb_kwargs={'page_no': page_no + 1})

我知道已经过去两年了,但这个回答也许还能帮到别人。

看起来你从未关闭过要写入的文件(因为你使用的是内联的 open())。请将你的代码与 Scrapy 文档中的代码进行比较(“使用项目导出器”部分):https://docs.scrapy.org/en/latest/topics/exporters.html

此外,该方法现在应该命名为 "close_spider",而不是 "spider_closed"。

将代码更改为以下内容应该会有所帮助:

from scrapy.exporters import CsvItemExporter
class PerFilenameExportPipeline:
    """Distribute items across multiple CSV files according to their 'page_no' field.

    For each distinct ``page_no`` value a file ``page-<page_no>.csv`` is
    opened and paired with its own ``CsvItemExporter``. Both are stored in
    ``self.filename_to_exporter`` as an ``(exporter, file)`` tuple so that the
    file can be closed when the spider closes.
    """

    def open_spider(self, spider):
        """Start with an empty ``filename -> (exporter, file)`` registry."""
        self.filename_to_exporter = {}

    def close_spider(self, spider):
        """Finish every exporter and close the file it was writing to."""
        # Iterate over (exporter, file) tuples instead of only exporters.
        for exporter, csv_file in self.filename_to_exporter.values():
            try:
                exporter.finish_exporting()
            finally:
                # Close the file even if finishing the export raised, so the
                # buffered rows are flushed to disk.
                csv_file.close()

    def _exporter_for_item(self, item):
        """Return the exporter for ``item``'s page, creating it on first use.

        Side effect: the ``'page_no'`` key is removed from ``item`` so that it
        is not written out as a CSV column.
        """
        filename = 'page-' + str(item['page_no'])
        del item['page_no']
        if filename not in self.filename_to_exporter:
            csv_file = open(f'{filename}.csv', 'wb')
            # Pass the file that was just opened; the earlier revision of this
            # line referenced an undefined name ``f``.
            exporter = CsvItemExporter(csv_file, export_empty_fields=True)
            exporter.start_exporting()
            # Keep both exporter and file so the file can be closed later.
            self.filename_to_exporter[filename] = (exporter, csv_file)
        # Callers only need the exporter, i.e. element [0] of the tuple.
        return self.filename_to_exporter[filename][0]

    def process_item(self, item, spider):
        """Export ``item`` to the CSV file of its page and pass it on."""
        exporter = self._exporter_for_item(item)
        exporter.export_item(item)
        return item

最新更新