Selenium使用scratchy Selenium模块从多个JavaScript页面上抓取数据



你好,现代世界的英雄们,

我目前正在抓取这个基于JS的网页https://golden.com/list-of-cryptocurrency-companies/这是我迄今为止实现的代码

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

class ScrapperSpider(scrapy.Spider):
name = 'scrapper'
allowed_domains = ['golden.com']
start_urls = ['https://golden.com/list-of-cryptocurrency-companies/']
current_page = 1

def __init__(self):

chrome_path = which('chromedriver')
self.driver = webdriver.Chrome(executable_path=chrome_path)  

def parse(self, response):
driver = self.driver 
number_of_pages = 27
for i in range(number_of_pages): 
url = 'https://golden.com/list-of-cryptocurrency-companies/'
driver.get(url + str(i+1))
driver.set_window_size(1920, 1080)
all_results = driver.find_element_by_xpath("//select[contains(@class, 'PageSize')]/option[3]").click()
new_table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "NewTable__body")))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
import time
time.sleep(5)
driver.implicitly_wait(10)
# driver.find_element
self.html = driver.page_source

resp = Selector(text=self.html)
for currency in resp.xpath("//div[@class='NewTable__body']/div"):
exchange_name = currency.xpath('.//div[1]/div/div/div/span/a/span/text()').get()
website = currency.xpath(".//div[3]/div/div/div/div/span/a/@href").get()
industry_type = currency.xpath(".//div[4]/div/div/div/div")
for industry in industry_type:
industry_1 = industry.xpath(".//div[1]/span/a/span/text()").get()
industry_2 = industry.xpath(".//div[2]/span/a/span/text()").get()
industry_3 = industry.xpath(".//div[3]/span/a/span/text()").get()
industry_4 = industry.xpath(".//div[4]/span/a/span/text()").get()
industry_5 = industry.xpath(".//div[5]/span/a/span/text()").get()


location = currency.xpath(".//div[5]/div/div/div/div/div/span/a/span/text()").get()

yield {
'ex_name': exchange_name,
'url': website,
'industry_1': industry_1,
'industry_2': industry_2,
'industry_3': industry_3,
'industry_4': industry_4,
'indsutry_5': industry_5,
'location': location
}

driver.close()   
driver.quit()

我的主要问题是网页从https://golden.com/list-of-cryptocurrency-companies/到https://golden.com/list-of-cryptocurrency-companies/2然后立即返回到原始形式,而不会从任何其他页面上刮下任何其他内容。现在,就我的一生而言,我似乎无法理解发生了什么,因为我已经为此工作了整整一周。

如果有人能帮我,我将不胜感激,因为我真的对一无所知

下面是一个关于如何等待url更改的示例代码。这将从每一页中抓取公司名称。

number_of_pages = 27
for i in range(number_of_pages):
url = 'https://golden.com/list-of-cryptocurrency-companies/'+ str(i+1)
driver.get(url)
# wait upto 10 seconds for url changes
WebDriverWait(driver, 10).until(EC.url_to_be(url))
companies = driver.find_elements_by_xpath("//div[@class='QueryResults']//span[@class='TopicLink__text']")
print("Printing from page#" , i+1)
for company in companies:
print(company.text)


driver.close()   
driver.quit()   

这是输出:

Printing from page# 1
Temtum
CRYPTOCURRENCY
BLOCKCHAIN
Tortola
National Digital Asset Exchange Inc. (NDAX)
CRYPTOCURRENCY
...
Printing from page# 2
Dentacoin
CRYPTOCURRENCY
BLOCKCHAIN
HEALTHCARE
Netherlands
Waves Platform
...

最新更新