I have the following file and code:
import logging
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class SpiderOpenCloseLogging:

    def __init__(self, item_count):
        self.item_count = item_count
        self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        print('Hey I am called')

        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        # if not crawler.settings.getbool('MYEXT_ENABLED'):
        #     raise NotConfigured

        # get the number of items from settings
        item_count = 1000  # crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

        # instantiate the extension object
        ext = cls(crawler.settings, crawler.stats)

        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        # return the extension object
        return ext

    def spider_opened(self, spider):
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        self.items_scraped += 1
        if self.items_scraped % self.item_count == 0:
            logger.info("scraped %d items", self.items_scraped)
And I have changed the settings accordingly:
MYEXT_ENABLED = True
EXTENSIONS = {
    'project.custom_extension.SpiderOpenCloseLogging': 300
}
But none of the signals are being called. I have checked the path given in the settings, and the spider itself is being run.
Even the print statement I added is never logged.
Can someone suggest what I am missing?
Thankfully, all of the signals are called in my adaptation of your script. You made a few mistakes that made no sense to me, because you never actually passed concrete values into the class; that is why you are not getting the signals but errors instead.

Several mistakes:

1.
def __init__(self, item_count, stats):
    self.item_count = item_count
    # self.items_scraped = 0  --- change this
    self.items_scraped = stats

def item_scraped(self, item, spider):
    # self.items_scraped += 1  --- you could do this, but then you would not
    #                              need crawler.stats (see the sketch after point 3)
    # if self.items_scraped % self.item_count == 0:  --- these should be the other way around
    logger.info("scraped %d items", self.items_scraped)

# Additional note:
# --- you never gave self.item_count an actual number. Computing item_count
#     inside from_crawler does not help, because the object is built with
#     cls(crawler.settings, crawler.stats), so self.item_count ends up holding
#     crawler.settings rather than the item count, and you get an error.
2. With those updates, we have the following corrections:
def __init__(self, item_count, stats):  # if you want to include crawler.stats
    self.item_count = item_count
    self.items_scraped = stats
and then, in spider_opened:
def spider_opened(self, spider):
    self.items_scraped = self.items_scraped.get_value('item_scraped_count')  # use crawler.stats to get the items scraped so far
    if self.items_scraped is None:
        self.items_scraped = 0  # otherwise start counting from 0
    self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)  # get your item count from the settings
    print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
    logger.info("opened spider %s", spider.name)
3.
def item_scraped(self, item, spider):
    logger.info(f"scraped few {self.items_scraped} items")
    self.items_scraped += 1
    if self.item_count % self.items_scraped == 0:  # these have been flipped
        logger.info(f"scraped increments {self.items_scraped} items")
Putting it all together, a full example:
import logging
from scrapy import signals
import scrapy

logger = logging.getLogger(__name__)


class SpiderOpenCloseLogging(scrapy.Spider):
    name = 'log_signals'

    start_urls = [f'http://quotes.toscrape.com/page/{i}/' for i in range(1, 11)]

    def __init__(self, item_count, stats):
        self.item_count = item_count
        self.items_scraped = stats
        # self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler.settings, crawler.stats)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        return ext

    def spider_opened(self, spider):
        self.items_scraped = self.items_scraped.get_value('item_scraped_count')
        if self.items_scraped is None:
            self.items_scraped = 0
        self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)
        print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        logger.info(f"scraped few {self.items_scraped} items")
        self.items_scraped += 1
        if self.item_count % self.items_scraped == 0:
            # print(f"scraped increments {self.items_scraped} items")
            logger.info(f"scraped increments {self.items_scraped} items")

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]//div')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get()
            }
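The output below comes from running the spider inside a project (the logger name shows scrapy_exercises.spiders.signals4, i.e. scrapy crawl log_signals). For a quick standalone run, something along these lines with CrawlerProcess should also work; this is only a sketch:

# Sketch only: run the example spider above without a full Scrapy project.
from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(SpiderOpenCloseLogging)  # the spider class defined above
    process.start()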
Output:
.
.
.
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 194 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/author/C-S-Lewis'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 195 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/christianity/page/1/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 196 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
{'some_items_links': '/tag/love/'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 197 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 198 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/author/J-K-Rowling'}
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 199 items
2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped increments 200 items
2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
{'some_items_links': '/tag/truth/page/1/'}
...