如何使用 Scrapy 和 Selenium 抓取无限滚动的页面？

我正在尝试使用 Scrapy + Selenium 抓取一个在向下滚动时动态加载数据的网页。我尝试了下面的代码，但始终拿不到页面源码，程序一直卡在 while 循环里。

import scrapy
from bs4 import BeautifulSoup
import re
from ..items import ManualScrapingItem
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
from scrapy.selector import Selector
# Spider from the question: loads the listing page in a Selenium-driven
# Firefox and keeps scrolling to the bottom so lazily loaded results appear.
# NOTE(review): the indentation of this snippet was lost when the page was
# extracted; the statements themselves are unchanged.
class MyspiderSpider(scrapy.Spider):
name = 'manuals'

start_urls = ['https://www.makemytrip.com/hotels/hotel-listing/?checkin=10272021&checkout=10302021&locusId=CTMAA&locusType=city&city=CTMAA&country=IN&searchText=Chennai%20Egmore%20Railway%20Station%2C%20Chennai&roomStayQualifier=1e0e&_uCurrency=INR&mmPoiTag=POI%7CChennai%20Egmore%20Railway%20Station%7CPOI49106%7C13.07817%7C80.25923&reference=hotel&filterData=FREE_BREAKFAST_AVAIL%7CBREAKFAST_AVAIL&type=poi']

# Start one Firefox instance when the spider object is constructed.
# NOTE(review): the executable_path string looks like its path separators
# were lost in extraction -- confirm the real geckodriver location.
def __init__(self):
self.driver = webdriver.Firefox(executable_path="D:Desktopworkmanual_repeatergeckodriver.exe")
def parse(self, response):

# Open the same URL in the browser and pause 5 seconds before measuring.
self.driver.get(response.url)
time.sleep(5)
# Height of the document before any scrolling; this is the only place
# prev_height is assigned.
prev_height = self.driver.execute_script("return document.body.scrollHeight")

while True:
# Scroll to the bottom, pause 5 seconds, then measure the height again.
self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
time.sleep(5)
new_height =self.driver.execute_script("return document.body.scrollHeight")

# prev_height is never updated inside the loop, so new_height is always
# compared with the initial height. Once the page has grown past that
# value the comparison stays false and the loop does not exit -- this is
# the "stuck in a loop" symptom described in the question.
if new_height == prev_height:            
break        
# Wrap the browser's current page source in a Scrapy Selector.
scrapy_selector = Selector(text = self.driver.page_source) 

# The yielded item holds the Selector object itself under the key "name".
yield{"name":scrapy_selector}

您需要在 while 循环内部更新 prev_height。否则页面高度一旦增长，new_height 就永远不会等于最初的 prev_height，循环也就无法结束：

import scrapy
from bs4 import BeautifulSoup
import re
from ..items import ManualScrapingItem
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
from scrapy.selector import Selector
class MyspiderSpider(scrapy.Spider):
    """Scrape an infinite-scroll listing page by driving Firefox with Selenium.

    Scrapy schedules the request for ``start_urls``; ``parse`` then loads the
    same URL in a Selenium-controlled browser, scrolls to the bottom until the
    document height stops changing, and yields a Selector built from the
    resulting page source.
    """

    name = 'manuals'

    start_urls = ['https://www.makemytrip.com/hotels/hotel-listing/?checkin=10272021&checkout=10302021&locusId=CTMAA&locusType=city&city=CTMAA&country=IN&searchText=Chennai%20Egmore%20Railway%20Station%2C%20Chennai&roomStayQualifier=1e0e&_uCurrency=INR&mmPoiTag=POI%7CChennai%20Egmore%20Railway%20Station%7CPOI49106%7C13.07817%7C80.25923&reference=hotel&filterData=FREE_BREAKFAST_AVAIL%7CBREAKFAST_AVAIL&type=poi']

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Firefox instance it drives.

        Positional and keyword arguments are forwarded to ``scrapy.Spider`` so
        the base-class initialisation still runs.
        """
        super().__init__(*args, **kwargs)
        # NOTE(review): this path looks like its separators were lost when the
        # snippet was copied -- point it at the real geckodriver location.
        self.driver = webdriver.Firefox(executable_path="D:Desktopworkmanual_repeatergeckodriver.exe")

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider.

        Without this the Firefox process started in ``__init__`` is left
        running after the crawl ends.
        """
        self.driver.quit()

    def parse(self, response):
        """Scroll the page to the end and yield its fully loaded source.

        Args:
            response: The Scrapy response for the start URL; only its ``url``
                is used, the page itself is re-fetched through Selenium.

        Yields:
            A dict whose ``"name"`` value is a Selector over the page source
            taken after scrolling has finished.
        """
        self.driver.get(response.url)
        time.sleep(5)
        # Initialise new_height first so the loop can copy it into prev_height.
        # The browser lives on the instance, so it must be reached via self.
        new_height = self.driver.execute_script("return document.body.scrollHeight")

        while True:
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            # Give the page time to load the next batch of results.
            time.sleep(5)
            # Remember the height seen on the previous pass before measuring
            # again; comparing against a stale initial height never terminates.
            prev_height = new_height
            new_height = self.driver.execute_script("return document.body.scrollHeight")

            # No growth after a scroll means nothing more was loaded.
            if new_height == prev_height:
                break

        scrapy_selector = Selector(text=self.driver.page_source)

        yield {"name": scrapy_selector}

相关内容

最新更新

热门标签：