Scrapy 的 response.css 获取不到标记

我构建了一个简单的 Scrapy 爬虫（spider），运行在 Scrapinghub 上：

class ExtractionSpider(scrapy.Spider):
    """Spider that walks offer listing pages and scrapes each offer's details.

    ``parse`` follows three kinds of links found on a listing page (offer
    titles, per-region offer labels, and the "next" pagination link);
    ``parse_details`` turns one offer page into a single item.
    """

    name = "extraction"
    allowed_domains = ['domain']
    start_urls = ['http://somedomainstart']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    def parse(self, response):
        """Yield a SplashRequest for every offer link and for the next page.

        NOTE(review): only the follow-up requests built here are
        SplashRequests; nothing visible in this class sends the start URL
        through Splash -- confirm how the first response is fetched.
        """
        # Links on offer titles lead to the detail page.
        offer_hrefs = response.css('a.offer-details__title-link::attr(href)').extract()
        print(offer_hrefs)
        for href in offer_hrefs:
            yield SplashRequest(url=response.urljoin(href), callback=self.parse_details)

        # Links on region labels are handled by the same detail callback.
        region_hrefs = response.css('a.offer-regions__label::attr(href)').extract()
        print(region_hrefs)
        for href in region_hrefs:
            yield SplashRequest(url=response.urljoin(href), callback=self.parse_details)

        # Pagination: re-enter parse() on the next listing page, if there is one.
        next_href = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
        if next_href:
            yield SplashRequest(url=response.urljoin(next_href), callback=self.parse)

    def parse_details(self, response):
        """Yield one item holding the first match of each detail selector.

        Each value is whatever ``extract_first()`` returns for the selector,
        i.e. the matched markup or ``None`` when nothing matches.
        """
        field_selectors = {
            'title': '#jobTitle',
            'content': '#description',
            'datePosted': 'span[itemprop="datePosted"]',
            'address': 'span[itemprop="address"]',
        }
        yield {
            field: response.css(selector).extract_first()
            for field, selector in field_selectors.items()
        }

我面临的问题是：尽管我在浏览器端的标记中能看到这些元素，但用于 multiple_locs_urls 的 response.css 查询返回的却是一个空数组。

我用 scrapy shell 检查过，scrapy shell 中同样看不到这些标记。我想这是因为这些标记是在页面加载时通过 JavaScript 渲染出来的。

我添加了 Splash，但它似乎并没有作用于 response。我要怎样才能让 Scrapy 等到页面加载完毕之后再执行查询？

请参阅页面的源代码：view-source:pracuj.pl/praca/polska;CT,1 。HTML 代码中没有带有类 "offer-regions__label" 的元素。

此代码将始终返回一个空列表：

# Queries the Scrapy response for the href of every <a> with class "offer-regions__label".
multiple_locs_urls = response.css('a.offer-regions__label::attr(href)')

但正如这里解释的那样 https://stackoverflow.com/a/17697329/9913319：

很多时候，在抓取时我们会遇到这样的问题：页面上呈现的内容是用 JavaScript 生成的，因此 Scrapy 无法抓取到它。

在这种情况下，您可以使用 Selenium。我修改了您的代码并做了检查，它可以工作：

class ExtractionSpider(scrapy.Spider):
    """Offer spider that additionally opens each listing page in Firefox.

    A Selenium-driven Firefox instance is created once per spider and used in
    ``parse`` to load the page in a real browser, so elements generated by
    JavaScript can be located. The browser is shut down in ``closed``.
    """

    name = "extraction"
    allowed_domains = ['domain']
    start_urls = ['http://somedomainstart']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    def __init__(self, **kwargs):
        """Start the Firefox browser that ``parse`` uses to load pages.

        Args:
            **kwargs: forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        super().__init__(**kwargs)
        profile = webdriver.FirefoxProfile("pathToFirefoxProfile")
        firefox_binary = "pathToFirefoxBinary"  # Must be the developer edition!!!
        # Alternative without a custom profile/binary: webdriver.Firefox()
        self.driver = webdriver.Firefox(profile, firefox_binary=firefox_binary)

    def closed(self, reason):
        """Quit the browser when the spider closes.

        Without this the Firefox process started in ``__init__`` is never
        released.

        Args:
            reason: close reason supplied by Scrapy; not used here.
        """
        self.driver.quit()

    def parse(self, response):
        """Print the offer links seen by the browser, then follow page links.

        Args:
            response: the Scrapy response for a listing page.

        Yields:
            SplashRequest objects for offer pages, region-specific offer
            pages and the next listing page.
        """
        # Load the page once in the browser. The located elements are read
        # straight away: navigating again before reading them would reload
        # the document and invalidate these element references.
        self.driver.get(response.url)
        elements = self.driver.find_elements_by_css_selector("a.offer-details__title-link")
        for element in elements:
            print("****")
            print(str(element.get_attribute("href")))
            print(str(element.text))

        # your old code below
        # NOTE(review): these selectors still query the Scrapy response, not
        # the browser-rendered page loaded above.
        urls = response.css('a.offer-details__title-link::attr(href)').extract()
        print(urls)
        for url in urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)

        multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
        print(multiple_locs_urls)
        for url in multiple_locs_urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)

        next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield SplashRequest(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        """Yield one item with the first match of each detail selector.

        Args:
            response: the response for a single offer page.

        Yields:
            A dict with keys ``title``, ``content``, ``datePosted`` and
            ``address``; each value is the result of ``extract_first()``
            (``None`` when the selector matches nothing).
        """
        yield {
            'title': response.css('#jobTitle').extract_first(),
            'content': response.css('#description').extract_first(),
            'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
            'address': response.css('span[itemprop="address"]').extract_first(),
        }

相关内容

最新更新

热门标签：