为什么我的爬虫无法从下一页提取数据?



嗨,我用 Scrapy 框架写了一个爬虫,它在第一页上运行得很好,但即使我已经写了抓取下一页的代码,它仍然无法从下一页获取同样的数据。我的代码哪里出错了?我的 items.py 文件是可以正常工作的。

这是我的代码

import scrapy
from amazonscraper.items import AmazonscraperItem
from scrapy.loader import ItemLoader

class AmazonspiderSpider(scrapy.Spider):
    """Spider that loads name, price and review data from an Amazon search listing."""

    name = 'amazonspider'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=fashion-womens-intl-ship&bbn=16225018011&rh=n%3A16225018011%2Cn%3A1040660%2Cn%3A1045024&pd_rd_r=2da30763-bfe6-4a38-b17a-77236fa718c5&pd_rd_w=JtaUW&pd_rd_wg=BtgRm&pf_rd_p=6a92dcea-e071-4bb9-866a-369bc067390d&pf_rd_r=86NBFKV4TA7CCSEVNBM7&qid=1671522114&rnid=1040660&ref=sr_pg_1']

    def parse(self, response):
        """Yield one loaded item per matched product element, then follow the next page.

        Args:
            response: The response for a search-results page.

        Yields:
            Loaded ``AmazonscraperItem`` objects, followed by at most one
            request for the next results page (handled by this same method).
        """
        for product in response.css('div.sg-col-4-of-12'):
            # Scope the loader to a single product element so the CSS
            # selectors below are evaluated relative to it.
            loader = ItemLoader(item=AmazonscraperItem(), selector=product)

            loader.add_css('name', 'a.a-link-normal span.a-size-base-plus')
            loader.add_css('price', 'span.a-price span.a-offscreen')
            loader.add_css('review', 'i.a-icon span.a-icon-alt')

            yield loader.load_item()

        # NOTE(review): this XPath is purely positional (e.g. ``div[52]``), so it
        # only matches when the page has exactly this layout -- presumably the
        # reason later pages are not followed; confirm against the live markup.
        next_page = response.xpath('//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[52]/div/div/span/a/@href').get()
        if next_page is None:
            return

        # The extracted href is prefixed with the site origin before following.
        next_page_url = 'https://www.amazon.com' + next_page
        yield response.follow(next_page_url, callback=self.parse)

这是我的AmazonScraperItem

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags

class AmazonscraperItem(scrapy.Item):
    """Item holding the name, price and review text scraped for one product."""

    # Every field uses the same pipeline: ``remove_tags`` is mapped over the
    # collected values on input, and ``TakeFirst`` picks a single value on output.
    name = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    price = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    review = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )

我已经修复了这个问题。原代码中有技术性错误,我做了以下修改:首先,更新了下一页的选择器,以便获取正确的 URL;其次,发送请求时不需要手动拼接任何 URL 前缀,因为你使用的是 response.follow,而 response.follow 会自动将相对 URL 转换为绝对 URL。下面的代码适用于多个页面(所有分页)。

class AmazonspiderSpider(scrapy.Spider):
    """Spider that walks an Amazon search listing page by page and loads product items."""

    name = 'amazonspider'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/s?i=fashion-womens-intl-ship&bbn=16225018011&rh=n%3A16225018011%2Cn%3A1040660%2Cn%3A1045024&pd_rd_r=2da30763-bfe6-4a38-b17a-77236fa718c5&pd_rd_w=JtaUW&pd_rd_wg=BtgRm&pf_rd_p=6a92dcea-e071-4bb9-866a-369bc067390d&pf_rd_r=86NBFKV4TA7CCSEVNBM7&qid=1671522114&rnid=1040660&ref=sr_pg_1']

    def parse(self, response):
        """Yield one loaded item per matched product element, then follow the next page.

        Args:
            response: The response for a search-results page.

        Yields:
            Loaded ``AmazonscraperItem`` objects, followed by a request for
            the next results page when a next-page link is found.
        """
        product_nodes = response.css('div.sg-col-4-of-12')
        for node in product_nodes:
            # Selectors added below are resolved relative to this product node.
            item_loader = ItemLoader(item=AmazonscraperItem(), selector=node)

            item_loader.add_css('name', 'a.a-link-normal span.a-size-base-plus')
            item_loader.add_css('price', 'span.a-price span.a-offscreen')
            item_loader.add_css('review', 'i.a-icon span.a-icon-alt')

            yield item_loader.load_item()

        # Locate the pagination "next" link by class instead of by position.
        next_page = response.css('.s-pagination-next ::attr(href)').get()
        if next_page is not None:
            # The href is passed as-is; response.follow accepts relative URLs.
            yield response.follow(next_page, callback=self.parse)

最新更新