通过 Flask 访问时在 Scrapy 中定义规则



我正在通过 Flask 和 crochet 运行一个 Scrapy 爬虫(spider)。在这个爬虫中,我使用 Rule 和 LinkExtractor 来定义规则。在规则中,我需要设置从 Flask 应用传入的 allow_domains。

spider.py

# Question's original spider (BROKEN ON PURPOSE -- this is what the question
# asks about): `rules` is evaluated at class-body scope, where `self` does
# not exist, so defining the class raises NameError.
class myCrawler(CrawlSpider):
name = 'symphony'
base_url=''
start_urls = []
allowed_domains = ''
def __init__(self, category='', **kwargs):
super().__init__(**kwargs)
self.base_url = category
# keep only the registrable domain, e.g. 'www.example.com' -> 'example.com'
self.allowed_domains = ['.'.join(urlparse(self.base_url).netloc.split('.')[-2:])]
self.start_urls.append(self.base_url)
print(f"Base url is {self.base_url} and allowed domain is {self.allowed_domains}")      
custom_settings = {
# in order to reduce the risk of getting blocked
'DOWNLOADER_MIDDLEWARES': {'sitescrapper.sitescrapper.middlewares.RotateUserAgentMiddleware': 400, },
'COOKIES_ENABLED': False,
'CONCURRENT_REQUESTS': 6,
'DOWNLOAD_DELAY': 2,
# Duplicates pipeline
'ITEM_PIPELINES': {'sitescrapper.sitescrapper.pipelines.DuplicatesPipeline': 300},
# In order to create a CSV file:
'FEEDS': {'csv_file.csv': {'format': 'csv'}}
}
rules = (
Rule(
# BUG (the question): `self` is undefined here at class-body scope,
# so this expression raises NameError when the class is created.
LinkExtractor(allow_domains='.'.join(urlparse(self.base_url).netloc.split('.')[-2:])),
process_links=process_links,
callback='parse_item',
follow=True
),
)

在这里我写了 LinkExtractor(allow_domains='.'.join(urlparse(self.base_url).netloc.split('.')[-2:]))。但是 self 在类体作用域中并没有定义,这会引发错误。那么,我如何把表达式 '.'.join(urlparse(self.base_url).netloc.split('.')[-2:]) 的值赋给参数 allow_domains(它与 self.allowed_domains 的值相同)?或者,有什么更好的方法可以实现这一点?

这里的问题在于 CrawlSpider 的构造函数 __init__ 也会处理 rules 参数,所以如果我们需要对 rules 赋值,就必须在调用父类的默认构造函数之前完成。

class myCrawler(CrawlSpider):
    """Crawl the site whose root URL is passed in as *category*, following
    only links that stay on that site's registrable domain.

    The crawl rules depend on the URL supplied at instantiation time, so
    ``self.rules`` is assigned inside ``__init__`` *before* calling
    ``super().__init__()`` -- CrawlSpider's constructor is what compiles
    the rules, so they must already be in place when it runs.
    """
    name = 'symphony'
    rotate_user_agent = True
    base_url = ''
    start_urls = []
    allowed_domains = ''

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {'sitescrapper.sitescrapper.middlewares.RotateUserAgentMiddleware': 400, },
        'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS': 6,
        'DOWNLOAD_DELAY': 2,
        # Duplicates pipeline
        'ITEM_PIPELINES': {'sitescrapper.sitescrapper.pipelines.DuplicatesPipeline': 300},
        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}}
    }

    def __init__(self, category='', **kwargs):
        """:param category: root URL of the site to crawl (e.g. ``https://www.example.com``)."""
        self.base_url = category
        # Keep only the registrable domain, e.g. 'www.example.com' -> 'example.com'.
        self.allowed_domains = ['.'.join(urlparse(self.base_url).netloc.split('.')[-2:])]
        # Assign a fresh instance-level list rather than appending to the
        # shared class-level `start_urls` -- appending would make URLs
        # accumulate across every spider instance created in this process.
        self.start_urls = [self.base_url]
        print(f"Base url is {self.base_url} and allowed domain is {self.allowed_domains}")
        # Rules must exist BEFORE the parent constructor runs, because
        # CrawlSpider.__init__ compiles self.rules.
        self.rules = (
            Rule(
                LinkExtractor(allow_domains=self.allowed_domains),
                process_links=process_links,
                callback='parse_item',
                follow=True
            ),
        )
        super().__init__(**kwargs)

最新更新