python连接信号未被调用



我有下面的文件和代码

import logging

from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class SpiderOpenCloseLogging:
    """Scrapy extension that logs spider open/close events and progress milestones.

    Logs an INFO line every ``item_count`` scraped items, plus one line when
    the spider opens and one when it closes.
    """

    def __init__(self, item_count):
        # Emit a progress log line every `item_count` items.
        self.item_count = item_count
        self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from the crawler and wire up its signal handlers."""
        print('Hey I am called')
        # First check whether the extension should be enabled; raise
        # NotConfigured otherwise (MYEXT_ENABLED = True in settings.py).
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured
        # Read the milestone size from settings, defaulting to 1000.
        item_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)
        # BUG FIX: the original called cls(crawler.settings, crawler.stats),
        # but __init__ accepts a single item_count argument. That raised
        # TypeError while the extension was being built, so none of the
        # signal connections below ever ran -- which is exactly why no
        # signal handler (and no print) was observed.
        ext = cls(item_count)
        # Connect the extension object to the signals it cares about.
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        return ext

    def spider_opened(self, spider):
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        # Count every scraped item; log at each multiple of item_count.
        self.items_scraped += 1
        if self.items_scraped % self.item_count == 0:
            logger.info("scraped %d items", self.items_scraped)

和我已经改变了设置

MYEXT_ENABLED = True 
EXTENSIONS = {
'project.custom_extension.SpiderOpenCloseLogging': 300
}

但是没有信号被调用,我检查了设置中给出的路径,蜘蛛正在被调用

甚至连我加的 print 输出也没有出现在日志里

有人可以建议我错过了什么

感谢

在我对你脚本的改写版本中,所有信号都能被正常调用。你的代码里有几处错误,但报错信息没有把问题明确指出来;正是这些错误导致你收不到信号,而是得到异常:

几个错误:

1。

def __init__(self, item_count, stats):
    # NOTE: despite the names, from_crawler passes crawler.settings as
    # `item_count` and crawler.stats as `stats` (see cls(crawler.settings,
    # crawler.stats) below); spider_opened later unpacks the real values.
    self.item_count = item_count
    # self.items_scraped = 0  --- change this
    self.items_scraped = stats
def item_scraped(self, item, spider):
    # self.items_scraped += 1  --- You could do this, but then you would not need `crawler.stats`
    # if self.items_scraped % self.item_count == 0:  --- these should be the other way around
    logger.info("scraped %d items", self.items_scraped)
    # Additional note:
    # --- you did not instantiate self.item_count; computing item_count
    # inside from_crawler does not help, because ext is built as
    # cls(crawler.settings, crawler.stats), so self.item_count receives
    # crawler.settings rather than item_count. So you will get an error.
    

    2. 通过更新,我们有以下更正:

def __init__(self, item_count, stats):  # if you want to include crawler.stats
    # `item_count` actually holds crawler.settings and `stats` holds
    # crawler.stats until spider_opened unpacks them into real values.
    self.item_count = item_count
    self.items_scraped = stats
    

def spider_opened(self, spider):
    # Use crawler.stats (stashed in self.items_scraped by __init__) to seed
    # the running count from the 'item_scraped_count' stat.
    self.items_scraped = self.items_scraped.get_value('item_scraped_count')
    if self.items_scraped is None:
        # Nothing scraped yet on a fresh run, so start the count at 0.
        self.items_scraped = 0
    # self.item_count held crawler.settings until now; read the real
    # milestone size from settings, defaulting to 1000.
    self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)
    print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
    logger.info("opened spider %s", spider.name)
    

    3.

def item_scraped(self, item, spider):
    logger.info(f"scraped few {self.items_scraped} items")
    self.items_scraped += 1
    # NOTE(review): the operands were deliberately flipped relative to the
    # question's `items_scraped % item_count`; this now logs whenever
    # items_scraped divides item_count evenly (e.g. 200 into 1000), not
    # once every item_count items -- confirm that is the intended behavior.
    if self.item_count % self.items_scraped == 0:  # these have been flipped
        logger.info(f"scraped increments {self.items_scraped} items")
    

    放在一起的例子:

    
import logging

from scrapy import signals
import scrapy

logger = logging.getLogger(__name__)


class SpiderOpenCloseLogging(scrapy.Spider):
    """Demo spider that connects its own signal handlers in from_crawler."""

    name = 'log_signals'
    start_urls = [f'http://quotes.toscrape.com/page/{i}/' for i in range(1, 11)]

    def __init__(self, item_count, stats):
        # NOTE(review): from_crawler passes (crawler.settings, crawler.stats),
        # so these attribute names are misleading until spider_opened unpacks
        # them into the real milestone size and running count.
        self.item_count = item_count
        self.items_scraped = stats
        # self.items_scraped = 0

    @classmethod
    def from_crawler(cls, crawler):
        # Build the spider and connect the three signal handlers to it.
        ext = cls(crawler.settings, crawler.stats)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        return ext

    def spider_opened(self, spider):
        # Seed the running count from the stats collector (None on a fresh run).
        self.items_scraped = self.items_scraped.get_value('item_scraped_count')
        if self.items_scraped is None:
            self.items_scraped = 0
        # self.item_count held crawler.settings until now; read the real value.
        self.item_count = self.item_count.getint('MYEXT_ITEMCOUNT', 1000)
        print(f'TEST: {self.items_scraped}, COUNT:{self.item_count}')
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        logger.info(f"scraped few {self.items_scraped} items")
        self.items_scraped += 1
        # Logs whenever items_scraped divides item_count evenly (see output:
        # fires at 200 with the default item_count of 1000).
        if self.item_count % self.items_scraped == 0:
            # print(f"scraped increments {self.items_scraped} items")
            logger.info(f"scraped increments {self.items_scraped} items")

    def start_requests(self):
        # Issue one request per start URL, all handled by parse().
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        # Yield the first link href from each <div> under the page's row container.
        content = response.xpath('//div[@class = "row"]//div')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get()
            }
    

    输出:

    .
    .
    .
    2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 194 items
    2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
    {'some_items_links': '/author/C-S-Lewis'}
    2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 195 items
    2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
    {'some_items_links': '/tag/christianity/page/1/'}
    2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 196 items
    2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/9/>
    {'some_items_links': '/tag/love/'}
    2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 197 items
    2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
    {'some_items_links': '/author/J-K-Rowling'}
    2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 198 items
    2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
    {'some_items_links': '/author/J-K-Rowling'}
    2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped few 199 items
    2022-07-07 02:55:30 [scrapy_exercises.spiders.signals4] INFO: scraped increments 200 items
    2022-07-07 02:55:30 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/page/10/>
    {'some_items_links': '/tag/truth/page/1/'}
    ...
    

    最新更新