这是我的爬虫的大致结构(省略了具体实现,只保留主体框架),保存为 test.py。
import scrapy,urllib.request
# Scrapy throttling / crawl-limit settings.
# NOTE(review): module-level variables in a spider file are NOT read by
# Scrapy as settings — they must live in settings.py or in the spider's
# custom_settings to take effect. Confirm where this file is loaded from.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 3
AUTOTHROTTLE_MAX_DELAY = 6
CONCURRENT_REQUESTS_PER_IP = 1
CONCURRENT_REQUESTS = 1
# CONCURRENT_REQUESTS_PER_SPIDER removed: deprecated in Scrapy.
CLOSESPIDER_PAGECOUNT = 100000
CLOSESPIDER_TIMEOUT = 36000
DOWNLOAD_DELAY = 3
# Deduplicated: RETRY_ENABLED was assigned False and then True (the later
# assignment won, so the effective value was True); COOKIES_ENABLED was
# assigned False twice.
RETRY_ENABLED = True
RETRY_TIMES = 1
COOKIES_ENABLED = False
class TestSpider(scrapy.Spider):
    # Spider name used by `scrapy crawl` / shown in the log.
    name = "quotes"
    # Placeholder from the question; replace with the real domain.
    allowed_domains = ["some web"]

    def __init__(self, *args, **kw):
        # NOTE(review): super().__init__(*args, **kw) is never called, so
        # Spider's own initialisation is skipped — confirm this is intended.
        self.timeout = 10

    def start_requests(self):
        # NOTE(review): `url` is not defined anywhere in this snippet.
        yield scrapy.Request(url,callback=self.parse)

    def parse(self, response):
        # Pseudo-code placeholder from the question — not valid Python.
        do something
使用命令 scrapy runspider test.py
运行它时,控制台末尾输出的统计摘要信息如下。
'downloader/request_count': 3391,
'finish_time': datetime.datetime(2017, 10, 25, 12, 29, 43, 101017),
'start_time': datetime.datetime(2017, 10, 25, 12, 24, 10, 63516)}
总时间 = 29min-24min+(43-10)=5min33秒=333秒
总请求数 =3391
我们得出的结论是,它可以像每秒获取 10 个 url 一样快地运行。
为什么DOWNLOAD_DELAY = 3
,AUTOTHROTTLE_ENABLED = True
,AUTOTHROTTLE_START_DELAY = 3
和AUTOTHROTTLE_MAX_DELAY = 6
不能减慢速度?
如何限制下载速度慢到每分钟获取 20 个网址?
首先请删除冗余设置(例如 CONCURRENT_REQUESTS_PER_SPIDER 已被弃用)。更关键的是:直接写在爬虫文件顶部的模块级变量并不会被 Scrapy 当作设置读取——设置必须放在 settings.py 或爬虫类的 custom_settings 里才会生效,这就是上面那些限速设置没有起作用的原因。试试下面这个爬虫,它的速率上限是每分钟 20 个请求:
import scrapy
from datetime import datetime
class QuotesSpider(scrapy.Spider):
    """Scrape quotes.toscrape.com at a fixed pace of one request every
    3 seconds (i.e. at most 20 requests per minute)."""

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]
    # Rate limiting: one in-flight request, a fixed (non-randomised)
    # 3-second delay, and AutoThrottle disabled so the delay stays exact.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 3,
        'AUTOTHROTTLE_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': False
    }

    def parse(self, response):
        """Yield one item per quote on the page, then follow the
        "next page" link until there is none."""
        # Timestamp each page so the request rate can be read off the log.
        self.logger.debug('%s', datetime.utcnow())
        for entry in response.css('div.quote'):
            text = entry.css('span.text::text').extract_first()
            author = entry.css('span small::text').extract_first()
            tags = entry.css('div.tags a.tag::text').extract()
            yield {
                'text': text,
                'author': author,
                'tags': tags,
            }
        href = response.css('li.next a::attr(href)').extract_first()
        if href is not None:
            yield scrapy.Request(response.urljoin(href), callback=self.parse)