I am getting an empty export file from Scrapy



I am trying to write a spider that parses this site, but something is going wrong. Can you help me find the mistake? The spider is linked to items.py:

import scrapy
from dyplom.items import DyplomtwoItem


class Dyplom(scrapy.Spider):
    name = "dyplom"
    start_urls = ['https://www.edimdoma.ru/retsepty?tags%5Brecipe_cuisine%5D%5B%5D=%D0%B0%D0%BC%D0%B5%D1%80%D0%B8%D0%BA%D0%B0%D0%BD%D1%81%D0%BA%D0%B0%D1%8F+%D0%BA%D1%83%D1%85%D0%BD%D1%8F&with_ingredient=&with_ingredient_condition=and&without_ingredient=&user_ids=&field=&direction=&query=']
    for i in range(2, 6):
        start_urls.append("https://www.edimdoma.ru/retsepty?_=1529256600422"
                          "&direction=&field=&page=" + str(i) +
                          "&query=&tags%5Brecipe_cuisine%5D%5B%5D=&user"
                          "_ids=&with_ingredient=&without_ingredient=")

    def parse(self, response):
        for href in response.xpath("//article[contains(@class, 'card')]/a//@href"):
            # add the scheme, eg http://
            url = "https://www.edimdoma.ru" + href.extract()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        item = DyplomtwoItem()
        item['id'] = response.xpath("//div[contains(@class, 'button button_print')]"
                                    "//a[contains(@class, 'drop-down_item')]/@href").extract()[0]
        item['title'] = response.xpath("//h1[contains(@class, 'recipe-header_name')]"
                                       "/descendant::text()").extract()
        item['image'] = response.xpath("//div[contains(@class, 'content-media')]/img//@src").extract()
        item['recipe'] = response.xpath("//div[contains(@class, 'content-box_content')]/div[contains"
                                        "(@class, 'plain-text recipe_step_text')]/descendant::text()").extract()
        yield item
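
For reference, a minimal items.py that matches the fields this spider fills in would look roughly like the sketch below. This is an assumption inferred from the assignments in parse_dir_contents; the actual dyplom/items.py is not shown in the question.

import scrapy


class DyplomtwoItem(scrapy.Item):
    # assumed fields, inferred from what parse_dir_contents assigns
    id = scrapy.Field()
    title = scrapy.Field()
    image = scrapy.Field()
    recipe = scrapy.Field()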

I included the item class from your project inside the scraper so that I could analyse in detail what you are doing. It is basically the same as your items.py.

It turns out your selectors had a few problems: you were not selecting all of the text. For the recipe field you need to use getall() instead of extract():

import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from itemloaders import ItemLoader
#from dyplom.items import DyplomtwoItem


class DyplomItem(scrapy.Item):
    id = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    image = Field(output_processor=TakeFirst())
    recipe = Field()


class Dyplom(scrapy.Spider):
    name = "dyplom"
    start_urls = ['https://www.edimdoma.ru/retsepty?tags%5Brecipe_cuisine%5D%5B%5D=%D0%B0%D0%BC%D0%B5%D1%80%D0%B8%D0%BA%D0%B0%D0%BD%D1%81%D0%BA%D0%B0%D1%8F+%D0%BA%D1%83%D1%85%D0%BD%D1%8F&with_ingredient=&with_ingredient_condition=and&without_ingredient=&user_ids=&field=&direction=&query=']
    for i in range(2, 6):
        start_urls.append("https://www.edimdoma.ru/retsepty?_=1529256600422"
                          "&direction=&field=&page=" + str(i) +
                          "&query=&tags%5Brecipe_cuisine%5D%5B%5D=&user"
                          "_ids=&with_ingredient=&without_ingredient=")

    def parse(self, response):
        for href in response.xpath("//article[contains(@class, 'card')]/a//@href"):
            # add the scheme, eg http://
            url = "https://www.edimdoma.ru" + href.extract()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        loaders = ItemLoader(DyplomItem())
        loaders.add_value('id', response.xpath("((//div[contains(@class, 'button button_print')])[1]//a)[1]/@href").get())
        loaders.add_value('title', response.xpath("//div[@class='content-box']//h1//text()").get())
        loaders.add_value('image', response.xpath("(//div[contains(@class, 'content-media')]//img/@src)[1]").get())
        for text_stuff in response.xpath("//div[contains(@class, 'plain-text recipe_step_text')]/descendant::text()").getall():
            loaders.add_value('recipe', text_stuff)
        yield loaders.load_item()

Output:

{'id': '/retsepty/146847-skrembl-s-bekonom/print?wi=true',
'image': 'https://e3.edimdoma.ru/data/recipes/0014/6847/146847-ed4_wide.jpg?1631992625',
'recipe': ['Бекон нарежьте кубиком. Можно взять и сырокопченый, и '
'варенокопченый, и свежий бекон.',
'В сковороде растопите сливочное масло.'],
'title': 'Скрэмбл с беконом'}
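
To confirm that the items actually reach the export file, the spider can be run with a command-line feed export, for example (assuming the project and spider are both named dyplom, as in the code above; the output file name is just an illustration):

scrapy crawl dyplom -o recipes.json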
