Unable to scrape all items



With the help of Selenium and Scrapy, I am only getting 12 items out of 487. How can I scrape all of them? I don't know where I am going wrong. Any help is appreciated.

URL: see start_urls in the code below.

My code:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from scrapy.selector import Selector
from selenium_stealth import stealth
from time import sleep


class CpcuSpider(CrawlSpider):
    name = 'cp'
    allowed_domains = ['www.arp.fr']
    start_urls = [
        'https://www.arp.fr/produits-portables-tablettes-ordinateurs-portables/?queryString=JTdCJTIyYXJlYUlkJTIyJTNBJTIyMkVEODhGMjctOTNFOS00NzQzLUI3NDYtRUNFQUJENUZFRDA4JTIyJTJDJTIyaXNRdWVyeSUyMiUzQWZhbHNlJTJDJTIyc29ydEF0dHJpYnV0ZSUyMiUzQW51bGwlMkMlMjJzb3J0RGlyZWN0aW9uJTIyJTNBbnVsbCUyQyUyMnBhZ2VubyUyMiUzQSUyMjElMjIlMkMlMjJwZXJQYWdlJTIyJTNBJTIyMTIlMjIlMkMlMjJ2YWx1ZXMlMjIlM0ElNUIlNUQlMkMlMjJwcm9kdWN0SWRzJTIyJTNBJTVCJTVEJTJDJTIycGFydG5lcklkJTIyJTNBbnVsbCUyQyUyMm9wdGlvbnMlMjIlM0ElNUJudWxsJTJDbnVsbCUyQ251bGwlNUQlN0Q=&page=' + str(x) + '&productfilter=&sort=null'
        for x in range(1, 6)
    ]
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//a[@class="rasEpicTitle rasElementReaction"]'), callback='parse_item', follow=False),
        # Rule(LinkExtractor(restrict_xpaths='//*[@class="fielddata"]/a'), callback='parse_item', follow=True),
    )

    def __init__(self):
        # this page loads
        CrawlSpider.__init__(self)
        chrome_path = which("chromedriver")
        self.driver = webdriver.Chrome(executable_path=chrome_path)

        print(dir(self.driver))
        self.driver.maximize_window()
        # self.driver.quit()

    def parse_item(self, response):
        self.driver.get(response.url)
        sleep(5)
        title = Selector(text=self.driver.page_source)
        # for list_node in lists.xpath('//*[@class="rasEpicBoxContainer"]'):

        yield {
            'Title': title.xpath('//*[@title="028001007"]/text()').get()
        }
        # self.driver.close()




There are a number of mistakes in your code, starting with start_urls. If you inspect the website, you will find that pagination does not work through the URL. For example, you cannot load the third page with https://www.arp.fr/produits-portables-tablettes-ordinateurs-portables/?queryString=JTdCJTIyYXJlYUlkJTIyJTNBJTIyMkVEODhGMjctOTNFOS00NzQzLUI3NDYtRUNFQUJENUZFRDA4JTIyJTJDJTIyaXNRdWVyeSUyMiUzQWZhbHNlJTJDJTIyc29ydEF0dHJpYnV0ZSUyMiUzQW51bGwlMkMlMjJzb3J0RGlyZWN0aW9uJTIyJTNBbnVsbCUyQyUyMnBhZ2VubyUyMiUzQSUyMjElMjIlMkMlMjJwZXJQYWdlJTIyJTNBJTIyMTIlMjIlMkMlMjJ2YWx1ZXMlMjIlM0ElNUIlNUQlMkMlMjJwcm9kdWN0SWRzJTIyJTNBJTVCJTVEJTJDJTIycGFydG5lcklkJTIyJTNBbnVsbCUyQyUyMm9wdGlvbnMlMjIlM0ElNUJudWxsJTJDbnVsbCUyQ251bGwlNUQlN0Q=&page=3&productfilter=&sort=null; you will just get the first page again.
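To see why the &page=N parameter has no effect, you can decode the queryString token from the URL yourself. Here is a minimal sketch (not part of the original answer) that reverses the site's encoding: Base64-decode, then URL-decode, then parse the JSON. The page number and page size turn out to be baked into the token itself:

import base64
import json
import urllib.parse

# queryString token copied verbatim from the question's URL
encoded = 'JTdCJTIyYXJlYUlkJTIyJTNBJTIyMkVEODhGMjctOTNFOS00NzQzLUI3NDYtRUNFQUJENUZFRDA4JTIyJTJDJTIyaXNRdWVyeSUyMiUzQWZhbHNlJTJDJTIyc29ydEF0dHJpYnV0ZSUyMiUzQW51bGwlMkMlMjJzb3J0RGlyZWN0aW9uJTIyJTNBbnVsbCUyQyUyMnBhZ2VubyUyMiUzQSUyMjElMjIlMkMlMjJwZXJQYWdlJTIyJTNBJTIyMTIlMjIlMkMlMjJ2YWx1ZXMlMjIlM0ElNUIlNUQlMkMlMjJwcm9kdWN0SWRzJTIyJTNBJTVCJTVEJTJDJTIycGFydG5lcklkJTIyJTNBbnVsbCUyQyUyMm9wdGlvbnMlMjIlM0ElNUJudWxsJTJDbnVsbCUyQ251bGwlNUQlN0Q='

# Reverse the encoding: Base64 decode -> URL decode -> JSON parse
decoded = json.loads(urllib.parse.unquote_plus(base64.b64decode(encoded).decode('ascii')))
print(decoded['pageno'])   # '1'  -> the requested page number lives inside the token
print(decoded['perPage'])  # '12' -> as does the page size, which is why only 12 items load

So the &page= parameter appended in start_urls is never read; the site only looks at the encoded token.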

I suggest a different approach: simulate the JavaScript call in the Scrapy spider. Internally, the website calls a special URL to receive JSON, which it then renders for you. We can try to do the same:

import scrapy
import json
import base64
import urllib.parse
from scrapy.http import HtmlResponse  # to build a response from a string
import chompjs  # to parse a Javascript object


def generate_query_string(query):
    # The website sends pagination and query parameters in a special HTTP header.
    # The header value is URL-encoded and then Base64-encoded.
    query_string_raw = json.dumps(query)
    query_string_urlencoded = urllib.parse.quote_plus(query_string_raw)
    query_string = base64.b64encode(query_string_urlencoded.encode('ascii')).decode('ascii')
    return query_string


class ArpSpider(scrapy.Spider):
    name = '68943284'
    # I got the query params below from your URL using an online Base64 decoder
    # and then an online URL decoder.
    # Best of all, we can set 500 results per page and get everything in a SINGLE call!
    query = {
        "areaId": "2ED88F27-93E9-4743-B746-ECEABD5FED08",
        "isQuery": False,
        "sortAttribute": None,
        "sortDirection": None,
        "pageno": "1",
        "perPage": "500",
        "values": [],
        "productIds": ["5267337-05", "5393345-05", "5400545-05", "5400812-05", "5404575-05", "5409557-05", "5410466-05", "5412282-05", "5412314-05", "5412318-05", "5412323-05", "5421276-05"],
        "partnerId": None,
        "options": [None, None, None]
    }

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.arp.fr/filter/page.json',
            headers={
                'queryString': generate_query_string(self.query),
            },
            callback=self.parse,
        )

    def parse(self, response):
        # with open('Samples/Arp.json', 'wb') as f:
        #     f.write(response.body)
        # The response is JSON; the HTML we need is one of its fields
        data = json.loads(response.text)
        # print(data['products'])
        response = HtmlResponse(url="My URL", body=data['products'], encoding='utf-8')
        # Now parse that HTML and extract the Javascript object with all the data we need
        javascript = response.xpath('//script[contains(., "dataLayer.push")]/text()').re_first(r'dataLayer\.push\(([\s\S]+?)\);')
        if javascript:
            data = chompjs.parse_js_object(javascript)
            for item in data['ecommerce']['impressions']:
                name = item['name']
                price = item['price']
                print(item)
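The loop above only prints each product to the console. If you want Scrapy to collect the results as items (so a feed export can write them out), a minimal variant of that final loop, assuming the same JSON structure, is to yield dictionaries instead:

            # Hypothetical drop-in replacement for the print loop in parse():
            # yield items so a feed export collects them, e.g.
            #   scrapy runspider spider.py -o items.json
            for item in data['ecommerce']['impressions']:
                yield {
                    'name': item['name'],
                    'price': item['price'],
                }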
