改进请求结构以提高速度



我创建了一个脚本,从网页中抓取一些元素,然后进入每个列表所附的链接。然后,它会从该网页中获取更多的信息,但抓取速度相对较慢。我得到了大约300/min,我的猜测是我的抓取器的结构,以及它是如何收集请求、跟踪url和抓取信息的。可能是这样吗?我该如何提高速度?

import scrapy
from scrapy.item import Field
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose, Join
from scrapy.crawler import CrawlerProcess
from price_parser import Price
def get_price(price_raw):
    """Parse a raw price string (e.g. '$12.50') and return the amount as a float."""
    return Price.fromstring(price_raw).amount_float
def get_currency(price_raw):
    """Parse a raw price string and return its currency symbol/code."""
    parsed = Price.fromstring(price_raw)
    return parsed.currency

class VinylItem(scrapy.Item):
    """One vinyl listing scraped from the Discogs marketplace.

    Listing-page fields are filled in ``parse``; format/released/genre/style
    come from the item's detail page in ``parse_vinyls``.
    """

    # --- listing-page fields ---
    title = Field(output_processor=TakeFirst())
    label = Field()
    media_condition = Field(
        input_processor=MapCompose(str.strip),
        output_processor=TakeFirst(),
    )
    sleeve_condition = Field(output_processor=TakeFirst())
    location = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    # price text is converted to a float / currency code by the helpers above
    price = Field(
        input_processor=MapCompose(get_price),
        output_processor=TakeFirst(),
    )
    currency = Field(
        input_processor=MapCompose(get_currency),
        output_processor=TakeFirst(),
    )
    rated = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    have_vinyl = Field(output_processor=TakeFirst())
    want_vinyl = Field(output_processor=TakeFirst())

    # --- detail-page fields ---
    format = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    released = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    genre = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )
    style = Field(
        input_processor=MapCompose(str.strip),
        output_processor=Join(),
    )

class VinylSpider(scrapy.Spider):
    """Scrape Discogs vinyl marketplace listings.

    ``parse`` extracts per-row fields from each listing page and follows each
    row's item link; ``parse_vinyls`` adds detail-page fields and yields the
    finished item. The half-filled ItemLoader is carried between the two
    callbacks via ``cb_kwargs``.
    """

    name = 'vinyl'
    #allowed_domains = ['x']
    start_urls = ['https://www.discogs.com/sell/list?format=Vinyl']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Parse one listing page: one ItemLoader per table row, then paginate."""
        rows = response.xpath(
            "//table[@class='table_block mpitems push_down table_responsive']//tbody//tr"
        )
        for row in rows:
            loader = ItemLoader(VinylItem(), selector=row)
            loader.add_xpath('title', "(.//strong//a)[position() mod 2=1]//text()")
            loader.add_xpath('label', './/p[@class="hide_mobile label_and_cat"]//a//text()')
            loader.add_xpath('media_condition', '(.//p[@class="item_condition"]//span)[position() mod 3=0]//text()')
            loader.add_xpath('sleeve_condition', './/p[@class="item_condition"]//span[@class="item_sleeve_condition"]//text()')
            loader.add_xpath('location', '(.//td[@class="seller_info"]//li)[position() mod 3=0]//text()')
            # BUG FIX: the original used ABSOLUTE XPaths here
            # ('(//tbody//tr//td//span[@class="price"])[...]'), which select
            # from the whole page rather than the current row, so every item
            # received the same price/currency. Row-relative paths fix that.
            loader.add_xpath('price', './/td//span[@class="price"]//text()')
            loader.add_xpath('currency', './/td//span[@class="price"]//text()')
            loader.add_xpath('rated', './/td//div[@class="community_rating"]//text()')
            loader.add_xpath('have_vinyl', '(.//td//div[@class="community_result"]//span[@class="community_label"])[contains(text(),"have")]//text()')
            loader.add_xpath('want_vinyl', '(.//td//div[@class="community_result"]//span[@class="community_label"])[contains(text(),"want")]//text()')

            link = row.xpath('.//td[@class="item_description"]//strong//@href').get()
            if link:
                # response.follow() resolves relative URLs itself, so the
                # explicit urljoin() of the original was redundant.
                yield response.follow(
                    link,
                    callback=self.parse_vinyls,
                    cb_kwargs={'loader': loader},
                )
            else:
                # Robustness: the original crashed on a missing link
                # (urljoin(None)); instead, emit the partially-filled item.
                yield loader.load_item()

        next_page = response.xpath('(//ul[@class="pagination_page_links"]//a)[last()]//@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_vinyls(self, response, loader):
        """Add detail-page fields to the loader from ``parse`` and yield the item."""
        loader.add_value('format', response.xpath("(.//div[@id='page_content']//div[5])[1]//text()").get())
        loader.add_value('released', response.xpath("(.//div[@id='page_content']//div[9])[1]//text()").get())
        loader.add_value('genre', response.xpath("(.//div[@id='page_content']//div[11])[1]//text()").get())
        loader.add_value('style', response.xpath("(.//div[@id='page_content']//div[13])[1]//text()").get())
        yield loader.load_item()

# Run the spider standalone. FEED_URI/FEED_FORMAT are deprecated since
# Scrapy 2.1 in favour of the FEEDS setting; output is unchanged — one
# JSON-lines record per item written to vinyl.jl.
process = CrawlerProcess(
    settings={
        'FEEDS': {
            'vinyl.jl': {'format': 'jsonlines'},
        },
    }
)
process.crawl(VinylSpider)
process.start()

根据您提供的代码片段,您的爬虫结构是合理的:它通过 yield 一次发出多个请求,让 Scrapy 能够并发地处理它们。

有几个设置可以调整以提高抓取速度。不过请注意,抓取的第一条规则是不要对你正在抓取的网站造成损害。请参阅下面可以调整的设置示例。

  1. 增加 CONCURRENT_REQUESTS 的值。在 Scrapy 中默认为 16
  2. 增加 CONCURRENT_REQUESTS_PER_DOMAIN 的值。在 Scrapy 中默认为 8
  3. 增加 REACTOR_THREADPOOL_MAXSIZE(Twisted IO 线程池的最大大小),使 DNS 解析更快
  4. 降低日志级别:LOG_LEVEL = 'INFO'
  5. 如果不需要 cookies,请禁用它们:COOKIES_ENABLED = False
  6. 减少下载超时:DOWNLOAD_TIMEOUT = 15
  7. 如果你的网速很快,并且确信目标网站能够承受,可以降低 DOWNLOAD_DELAY 的值(但不建议这样做)

从文档中了解有关这些设置的更多信息

如果以上设置不能解决您的问题,那么您可能需要查看分布式爬网

最新更新