Scrapy:调试重定向(301)



在我得到错误之前"HTTP状态码未被处理或不被允许";,我修改了处于默认模式的USER_AGENT,现在我得到了这个错误:

import scrapy


class OlxSpider(scrapy.Spider):
name = "olx"
allowed_domains = ["pe.olx.com.br"]
start_urls = (
'http://pe.olx.com.br/imoveis/aluguel',
)

def parse(self, response):
items =  response.xpath(
'//div[contains(@class,"section_OLXad-list")]//li[contains'
'(@class,"item")]'
)
for item in items:
url = item.xpath(
".//a[contains(@class,'OLXad-list-link')]/@href"
).extract_first()
yield scrapy.Request(url=url, callback=self.parse_detail)

next_page = response.xpath(
'//li[contains(@class,"item next")]//a/@href'
).extract_first()
if next_page:
self.log('Next Page: {0}'.format(next_page))
yield scrapy.Request(url=next_page, callback=self.parse)

def parse_detail(self, response):
self.log(u'Imóvel URL: {0}'.format(response.url))
item = {}
item['photos'] = response.xpath(
'//div[contains(@class,"photos")]//a/@href'
).extract()
item['url'] = response.url
item['address'] = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-location")]'
'//.)'
).extract_first()
item['title'] = response.xpath(
'normalize-space(//h1[contains(@id,"ad_title")]//.)'
).extract_first()
item['price'] = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-price")]'
'//span[contains(@class,"actual-price")]//.)'
).extract_first()
item['details'] = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-description")]'
'//.)'
).extract_first()
item['source_id'] = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
).extract_first()
date = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
).re("Inserido em: (.*).")
item['date'] = (date and date[0]) or ''
yield item

试图在终端中执行.py文件,我得到以下消息:

2022-01-13 12:36:36 [scrapy.core.engine] INFO: Spider opened
2022-01-13 12:36:36 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-01-13 12:36:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/robots.txt> from <GET http://pe.olx.com.br/robots.txt>
2022-01-13 12:36:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://pe.olx.com.br/robots.txt> (referer: None)
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/imoveis/aluguel> from <GET http://pe.olx.com.br/imoveis/aluguel>

有人知道是什么导致了这个问题吗?

附言:我已经尝试了这些解决方案Python Scrapy 301重定向

它只是从http重定向到https,所以没有问题。

您的xpath完全错误。我在parse中修复了它,并在parse_detail中修复了3个xpath作为示例,但您需要修复其余部分。

import scrapy

class OlxSpider(scrapy.Spider):
name = "olx"
allowed_domains = ["pe.olx.com.br"]
start_urls = (
'http://pe.olx.com.br/imoveis/aluguel',
)
def parse(self, response):
# from scrapy.shell import inspect_response
# inspect_response(response, self)
items = response.xpath('//ul[@id="ad-list"]/li')
for item in items:
url = item.xpath('.//a/@href').get()
if url:
yield scrapy.Request(url=url, callback=self.parse_detail)
next_page = response.xpath('//a[@data-lurker-detail="next_page"]/@href').get()
if next_page:
self.log('Next Page: {0}'.format(next_page))
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_detail(self, response):
self.log(u'Imóvel URL: {0}'.format(response.url))
item = {}
item['photos'] = response.xpath('//img[@class="image "]/@src').get()
item['url'] = response.url
item['address'] = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-location")]'
'//.)'
).extract_first()
item['title'] = response.xpath('//h1/text()').get()
item['price'] = response.xpath('//h2/text()').get()
item['details'] = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-description")]'
'//.)'
).extract_first()
item['source_id'] = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
).extract_first()
date = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
).re("Inserido em: (.*).")
item['date'] = (date and date[0]) or ''
yield item

最新更新