Speeding up a python3 scraping script



I want to use the following Python3/Scrapy script to bulk download the freely downloadable PDFs (copies of an old newspaper called the Gaceta, from 1843 to 1900) from this website of the Nicaraguan National Assembly (see the previous question here):

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# A scrapy script to download issues of the Gaceta de Nicaragua (1843-1961)
# virtualenv -p python3 envname
# source envname/bin/activate
# scrapy runspider gaceta_downloader.py
import errno
import json
import os

import scrapy
from scrapy import FormRequest, Request

pwd = "/Downloads"
os.chdir(pwd)  # change the working directory to pwd
print(os.getcwd())


class AsambleaSpider(scrapy.Spider):
    name = 'asamblea'
    allowed_domains = ['asamblea.gob.ni']
    start_urls = ['http://digesto.asamblea.gob.ni/consultas/coleccion/']

    papers = {
        "Diario Oficial": "28",
    }

    def parse(self, response):
        # request the list of documents (rdds) for each collection id
        for key, value in list(self.papers.items()):
            yield FormRequest(
                url='http://digesto.asamblea.gob.ni/consultas/util/ws/proxy.php',
                headers={'X-Requested-With': 'XMLHttpRequest'},
                formdata={
                    'hddQueryType': 'initgetRdds',
                    'cole': value
                },
                meta={'paper': key},
                callback=self.parse_rdds
            )

    def parse_rdds(self, response):
        # the proxy endpoint answers with JSON; response.text works on newer Scrapy versions
        data = json.loads(response.body_as_unicode())
        for r in data["rdds"]:
            r['paper'] = response.meta['paper']
            rddid = r['rddid']
            yield Request("http://digesto.asamblea.gob.ni/consultas/util/pdf.php?type=rdd&rdd=" + rddid,
                          callback=self.download_pdf, meta=r)

    def download_pdf(self, response):
        filename = "{paper}/{anio}/".format(**response.meta) \
                   + "{titulo}-{fecPublica}.pdf".format(**response.meta).replace("/", "_")
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:  # guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        with open(filename, 'wb') as f:
            f.write(response.body)

The script does the job of getting the direct links from the php file and then downloading the PDFs, but two things still bother me:

  1. I would like to be able to set a date range for the Gacetas I want to download, i.e. all (available) issues between 01/01/1844 and 01/01/1900. I tried to figure it out myself, but to no avail, as I am a programming novice.
  2. I would like to speed up the script. Maybe with xargs? As of now, even though I haven't measured it, execution feels slow.

Disclaimer: I haven't tested the script, since scrapy requires Microsoft Visual C++ 14.0 and downloading and installing it takes a while :(

Here is an updated script where I added the date range as start and end, and modified the parse_rdds method so it only downloads files within that time range.

As for optimizing it, scrapy is a non-blocking library and, as far as I understand, it should already be able to download several files in parallel as-is. Keep in mind that you are downloading what looks like a lot of files, so it may naturally take a while; if you still want to tune the parallelism, see the settings sketch after the script below.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# A scrapy script to download issues of the Gaceta de Nicaragua (1843-1961)
# virtualenv -p python3 envname
# source envname/bin/activate
# scrapy runspider gaceta_downloader.py
import errno
import json
import os
from datetime import datetime

import scrapy
from scrapy import FormRequest, Request

pwd = "/Downloads"
os.chdir(pwd)  # change the working directory to pwd
print(os.getcwd())

# date range, format DD/MM/YYYY
start = '16/01/1844'
end = '01/01/1900'
date_format = '%d/%m/%Y'
start = datetime.strptime(start, date_format)
end = datetime.strptime(end, date_format)


class AsambleaSpider(scrapy.Spider):
    name = 'asamblea'
    allowed_domains = ['asamblea.gob.ni']
    start_urls = ['http://digesto.asamblea.gob.ni/consultas/coleccion/']

    papers = {
        "Diario Oficial": "28",
    }

    def parse(self, response):
        # request the list of documents (rdds) for each collection id
        for key, value in list(self.papers.items()):
            yield FormRequest(
                url='http://digesto.asamblea.gob.ni/consultas/util/ws/proxy.php',
                headers={'X-Requested-With': 'XMLHttpRequest'},
                formdata={
                    'hddQueryType': 'initgetRdds',
                    'cole': value
                },
                meta={'paper': key},
                callback=self.parse_rdds
            )

    def parse_rdds(self, response):
        # the proxy endpoint answers with JSON; response.text works on newer Scrapy versions
        data = json.loads(response.body_as_unicode())
        for r in data["rdds"]:
            if not r['fecPublica']:
                continue  # skip entries without a publication date
            r_date = datetime.strptime(r['fecPublica'], date_format)
            if start <= r_date <= end:
                r['paper'] = response.meta['paper']
                rddid = r['rddid']
                yield Request("http://digesto.asamblea.gob.ni/consultas/util/pdf.php?type=rdd&rdd=" + rddid,
                              callback=self.download_pdf, meta=r)

    def download_pdf(self, response):
        filename = "{paper}/{anio}/".format(**response.meta) \
                   + "{titulo}-{fecPublica}.pdf".format(**response.meta).replace("/", "_")
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:  # guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        with open(filename, 'wb') as f:
            f.write(response.body)
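
If it still feels too slow for your taste, the parallelism can be tuned through Scrapy's own settings rather than with xargs, for example via custom_settings on the spider class. This is only a sketch: the setting names are Scrapy's standard knobs, but the numbers below are untested guesses, not measured optima.

# Sketch only: standard Scrapy settings; the values are assumptions, not benchmarks.
import scrapy

class AsambleaSpider(scrapy.Spider):
    name = 'asamblea'
    # ... same attributes and methods as in the script above ...
    custom_settings = {
        'CONCURRENT_REQUESTS': 32,             # total requests in flight (Scrapy default: 16)
        'CONCURRENT_REQUESTS_PER_DOMAIN': 16,  # per-domain limit (default: 8)
        'DOWNLOAD_DELAY': 0,                   # seconds to wait between requests to the same site
        'AUTOTHROTTLE_ENABLED': True,          # back off automatically if the server slows down
    }

Raising these numbers puts more load on asamblea.gob.ni, so keep AUTOTHROTTLE_ENABLED on (or stay with the defaults) if the server starts timing out; with this many PDFs the run will take a while regardless.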
