我对网页抓取相当陌生,一直在尝试直接从网站下载 .csv 文件。我设法修复了上一次编辑中的最后一个问题,但是在尝试下载文件时,我遇到了一个新的错误。错误如下:
raise ValueError(f'Missing scheme in request url: {self._url}')
ValueError: Missing scheme in request url: h
我不确定是什么触发了这个错误,因为链接正确地遵循下一个函数。
例如,下面是我尝试过的:
import scrapy
from nhs.items import DownfilesItem
class NhsScapeSpider(scrapy.Spider):
    """Spider for the NHS England A&E waiting-times statistics pages.

    Follows each year link in the sidebar navigation and yields
    DownfilesItem objects for the .xls attachments found on those pages.
    """
    name = 'nhs_scape'
    #allowed_domains = ['nh']
    start_urls = ['https://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/ae-attendances-and-emergency-admissions-2021-22/']
    custom_settings = {
        # Browser-like user agent; the site may block Scrapy's default one.
        'USER_AGENT':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }

    def start_requests(self):
        # One request per configured start URL, handled by self.parse.
        for url in self.start_urls:
            yield scrapy.Request(
                url = url,
                callback = self.parse
            )

    def parse(self, response):
        # Sidebar navigation: one <li> per statistics year.
        side_panel = response.xpath("//aside[@class='subnav group minimal_nav desktop-only']//ul[@class='children']//li")
        for years in side_panel:
            year_links = years.xpath('.//a/@href').get()
            yield response.follow(year_links, callback = self.download_files)

    def download_files(self, response):
        # Paragraphs holding the file links, and the month/period headings.
        test_files = response.xpath("//article[@class='rich-text']//p")
        month_files = response.xpath("//article[@class='rich-text']//h3")
        for files, mn in zip(test_files, month_files):
            all_files = files.xpath('.//a//@href').getall()
            all_file_names = files.xpath('.//a//text()').getall()
            month_year = mn.xpath('.//text()').get()  # currently unused
            for ind_files,ind_text in zip(all_files, all_file_names):
                item = DownfilesItem()
                # NOTE(review): FilesPipeline expects item['file_urls'] to be
                # a LIST of URL strings. Assigning the bare string makes the
                # pipeline iterate it character by character, which explains
                # the "Missing scheme in request url: h" ValueError quoted
                # above ('h' is the first character of 'https://...').
                if '.xls' in ind_files and 'Monthly' in ind_text:
                    item['file_urls'] = ind_files
                    item['original_file_name'] = ind_text
                    yield item
                elif '.xls' in ind_files and 'Week' in ind_text:
                    item['file_urls'] = ind_files
                    item['original_file_name'] = ind_text
                    yield item
Items.py:
import scrapy
class DownfilesItem(scrapy.Item):
    """Item carrying the download URL(s) and the desired on-disk name."""
    # URLs consumed by the files pipeline; should be a list of strings.
    file_urls = scrapy.Field()
    # Original link text, intended as the stored file's name.
    original_file_name = scrapy.Field()
Pipelines.py:
from scrapy.pipelines.files import FilesPipeline
class DownfilesPipeline(FilesPipeline):
    """FilesPipeline that stores each file under its own file name.

    The stock FilesPipeline saves files under a SHA1 hash of the URL;
    this override derives the path inside FILES_STORE from the URL
    instead.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the relative storage path for one downloaded file.

        Uses the LAST path segment of the URL. Note the index: for
        'https://host/a/b.xls', split('/')[1] is the empty string (the
        gap between 'https:' and '//'), which made the pipeline try to
        write to the store directory itself (IsADirectoryError); [-1]
        yields the actual file name.
        The keyword-only ``item`` parameter matches the modern Scrapy
        signature and defaults to None, so older callers still work.
        """
        file_name: str = request.url.split("/")[-1]
        return file_name
Settings.py:
# Register the custom files pipeline (priority 150 in the pipeline order).
ITEM_PIPELINES = {'nhs.pipelines.DownfilesPipeline': 150}
# Directory (relative to the project root) where downloaded files are stored.
FILES_STORE = "Files"
按照 @SuperUser 的回答更新代码后出现的新错误:
IsADirectoryError: [Errno 21] Is a directory: 'Files/'
似乎这是由FILES_STORE = "Files"
引起的,但是当我删除这个时,我没有得到错误,但也没有下载文件。
item['file_urls']
应该是一个列表:
if '.xls' in ind_files and 'Monthly' in ind_text:
    # file_urls must be a LIST of URLs — FilesPipeline iterates it.
    item['file_urls'] = [ind_files]
    item['original_file_name'] = ind_text
    yield item
elif '.xls' in ind_files and 'Week' in ind_text:
    # Same fix for the weekly files branch.
    item['file_urls'] = [ind_files]
    item['original_file_name'] = ind_text
    yield item
编辑:
第二个错误是由于管道,file_name
是一个空字符串,你可以改变它,例如:
file_name: str = request.url.split("/")[-1]  # [-1] = last URL segment, i.e. the actual file name
编辑2:
我认为问题出在xpath选择器上,试试这个并根据你的需要调整它:
import scrapy
from tempbuffer.items import DownfilesItem
class NhsScapeSpider(scrapy.Spider):
    """Crawl the NHS England A&E statistics pages and yield a download
    item for every monthly/weekly .xls attachment found."""

    name = 'nhs_scape'
    #allowed_domains = ['nh']
    start_urls = ['https://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/ae-attendances-and-emergency-admissions-2021-22/']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }

    def start_requests(self):
        """Kick off the crawl from each configured start URL."""
        for start in self.start_urls:
            yield scrapy.Request(url=start, callback=self.parse)

    def parse(self, response):
        """Follow every year link listed in the sidebar navigation."""
        nav_entries = response.xpath("//aside[@class='subnav group minimal_nav desktop-only']//ul[@class='children']//li")
        for entry in nav_entries:
            href = entry.xpath('.//a/@href').get()
            yield response.follow(href, callback=self.download_files)

    def download_files(self, response):
        """Pair each month heading with its paragraph of links and yield
        one DownfilesItem per matching .xls attachment."""
        # Restrict to paragraphs that actually contain an .xls link.
        link_paragraphs = response.xpath("//article[@class='rich-text']//p[a[contains(@href, '.xls')]]")
        # Headings that start with a month name, matched positionally
        # with the paragraphs above.
        period_headings = response.xpath("//article[@class='rich-text']//h3[starts-with(text(), 'January') or starts-with(text(), 'February') or starts-with(text(), 'March') or starts-with(text(), 'April') or starts-with(text(), 'May') or starts-with(text(), 'June') or starts-with(text(), 'July') or starts-with(text(), 'August') or starts-with(text(), 'September') or starts-with(text(), 'October') or starts-with(text(), 'November') or starts-with(text(), 'December')]")
        for paragraph, heading in zip(link_paragraphs, period_headings):
            hrefs = paragraph.xpath('.//a//@href').getall()
            labels = paragraph.xpath('.//a//text()').getall()
            month_year = heading.xpath('.//text()').get()
            for href, label in zip(hrefs, labels):
                # Skip anything that is not a spreadsheet link.
                if '.xls' not in href:
                    continue
                # Keep only the monthly and weekly data files.
                if 'Monthly' in label or 'Week' in label:
                    item = DownfilesItem()
                    item['file_urls'] = [href]
                    item['original_file_name'] = label
                    yield item