我正在尝试使用 Scrapy + Selenium 抓取一个在向下滚动时才动态加载数据的网页。我尝试了下面的代码,但没有得到页面源代码,而且程序陷入了死循环。
import scrapy
from bs4 import BeautifulSoup
import re
from ..items import ManualScrapingItem
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
from scrapy.selector import Selector
# Spider from the question: opens the start URL in a Selenium-driven Firefox and
# scrolls to the bottom of the page repeatedly before capturing the page source.
# NOTE(review): the indentation of this snippet appears to have been lost when
# it was pasted; the code is kept verbatim here.
class MyspiderSpider(scrapy.Spider):
name = 'manuals'
start_urls = ['https://www.makemytrip.com/hotels/hotel-listing/?checkin=10272021&checkout=10302021&locusId=CTMAA&locusType=city&city=CTMAA&country=IN&searchText=Chennai%20Egmore%20Railway%20Station%2C%20Chennai&roomStayQualifier=1e0e&_uCurrency=INR&mmPoiTag=POI%7CChennai%20Egmore%20Railway%20Station%7CPOI49106%7C13.07817%7C80.25923&reference=hotel&filterData=FREE_BREAKFAST_AVAIL%7CBREAKFAST_AVAIL&type=poi']
# Creates the Firefox WebDriver used by parse(); the geckodriver path is hard-coded.
# NOTE(review): the path looks like its backslashes were stripped - confirm.
def __init__(self):
self.driver = webdriver.Firefox(executable_path="D:Desktopworkmanual_repeatergeckodriver.exe")
# Loads response.url in the browser, scrolls to the bottom in a loop, then
# yields a dict whose "name" value is a Selector built from the page source.
def parse(self, response):
self.driver.get(response.url)
# Fixed 5-second pause after navigation before measuring the page height.
time.sleep(5)
prev_height = self.driver.execute_script("return document.body.scrollHeight")
# NOTE: prev_height is assigned only once (above) and never updated inside the
# loop, so the exit test always compares against the initial height. Once the
# page has grown past that height the loop cannot terminate.
while True:
self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
# Fixed 5-second pause after each scroll before re-reading the height.
time.sleep(5)
new_height =self.driver.execute_script("return document.body.scrollHeight")
if new_height == prev_height:
break
scrapy_selector = Selector(text = self.driver.page_source)
yield{"name":scrapy_selector}
您需要在 `while` 循环中更新 `prev_height`,否则每次都是拿最初的页面高度来比较,页面一旦变高,退出条件就永远不会成立:
import scrapy
from bs4 import BeautifulSoup
import re
from ..items import ManualScrapingItem
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
from scrapy.selector import Selector
class MyspiderSpider(scrapy.Spider):
    """Scrape an infinite-scroll listing page by rendering it in Firefox.

    The start URL is opened in a Selenium-driven Firefox instance, the page
    is scrolled to the bottom until its height stops changing, and the
    resulting page source is wrapped in a Scrapy ``Selector``.
    """

    name = 'manuals'
    start_urls = ['https://www.makemytrip.com/hotels/hotel-listing/?checkin=10272021&checkout=10302021&locusId=CTMAA&locusType=city&city=CTMAA&country=IN&searchText=Chennai%20Egmore%20Railway%20Station%2C%20Chennai&roomStayQualifier=1e0e&_uCurrency=INR&mmPoiTag=POI%7CChennai%20Egmore%20Railway%20Station%7CPOI49106%7C13.07817%7C80.25923&reference=hotel&filterData=FREE_BREAKFAST_AVAIL%7CBREAKFAST_AVAIL&type=poi']

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Firefox WebDriver.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so that Scrapy's own initialisation runs and spider arguments are
        accepted.
        """
        super().__init__(*args, **kwargs)
        # NOTE(review): this path looks like its backslashes were lost when
        # the snippet was pasted - confirm the real geckodriver location.
        self.driver = webdriver.Firefox(executable_path="D:Desktopworkmanual_repeatergeckodriver.exe")

    def parse(self, response):
        """Render ``response.url`` in the browser, scroll to the end, yield the page.

        Yields:
            A dict whose ``"name"`` value is a ``Selector`` built from the
            page source after scrolling has finished.
        """
        self.driver.get(response.url)
        # Fixed pause so the initial content can load before measuring.
        time.sleep(5)

        # Must go through self.driver: a bare ``driver`` is not defined in
        # this scope and would raise NameError.
        new_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            # Fixed pause so newly requested content can be appended.
            time.sleep(5)
            # Remember the height from the previous pass before re-measuring;
            # without this update the loop would compare against a stale
            # value and never terminate once the page has grown.
            prev_height = new_height
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == prev_height:
                # No growth since the last scroll: stop scrolling.
                break

        scrapy_selector = Selector(text=self.driver.page_source)
        yield {"name": scrapy_selector}

    def closed(self, reason):
        """Quit the browser when the spider closes so Firefox is not leaked."""
        self.driver.quit()