如何从JavaScript网站抓取数据

我正试图从这个动态JavaScript网站上抓取数据。由于页面是动态的，我使用Selenium从表中提取数据。请建议我如何从动态表中提取数据。这是我的密码。

import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import lxml.html as LH
import requests
# specify the url
urlpage = 'http://www.sotaventogalicia.com/en/real-time-data/historical'
print(urlpage)
# run firefox webdriver from executable path of your choice
driver = webdriver.Chrome('C:/Users/Shresth Suman/Downloads/chromedriver_win32/chromedriver.exe')
##driver = webdriver.Firefox(executable_path = 'C:/Users/Shresth Suman/Downloads/geckodriver-v0.26.0-win64/geckodriver.exe')
# get web page
driver.get(urlpage)
# execute script to scroll down the page
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
# sleep for 5s
time.sleep(5)
# driver.quit()

# find elements by xpath
##results = driver.find_elements_by_xpath("//div[@id='div_taboa']//table[@id='taboa']/tbody")
##results = driver.find_elements_by_xpath("//*[@id='page-title']")
##results = driver.find_elements_by_xpath("//*[@id='div_main']/h2[1]")
results = driver.find_elements_by_xpath("//*[@id = 'frame_historicos']")
print(results)
print(len(results))

# create empty array to store data
data = []
# loop over results
for result in results:
heading = result.text
print(heading)
headingfind = result.find_element_by_tag_name('h1')
# append dict to array
data.append({"head" : headingfind, "name" : heading})
# close driver 
driver.quit()
###################################################################

# save to pandas dataframe
df = pd.DataFrame(data)
print(df)
# write to csv
df.to_csv('testsot.csv')

我想提取2005年至今的数据，平均值/总计为10分钟，这只给我一个月的数据

诱导WebDriverWait和element_to_be_clickable((
安装漂亮的汤库
使用熊猫read_html((

我还没有创建列表。您应该为1/1/2005之后的所有月份创建开始日期和结束日期列表以及itearte

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
urlpage = 'http://www.sotaventogalicia.com/en/real-time-data/historical'
driver = webdriver.Chrome('C:/Users/Shresth Suman/Downloads/chromedriver_win32/chromedriver.exe')
driver.get(urlpage)
WebDriverWait(driver,20).until(EC.frame_to_be_available_and_switch_to_it((By.ID,"frame_historicos")))
inputstartdate=WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"(//input[@class='dijitReset dijitInputInner'])[1]")))
inputstartdate.clear()
inputstartdate.send_keys("1/1/2005")
inputenddate=WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"(//input[@class='dijitReset dijitInputInner'])[last()]")))
inputenddate.clear()
inputenddate.send_keys("1/31/2005")
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//input[@class='form-submit'][@value='REFRESH']"))).click()
WebDriverWait(driver,20).until(EC.visibility_of_element_located((By.CSS_SELECTOR,"table#taboa")))
time.sleep(3)
soup=BeautifulSoup(driver.page_source,"html.parser")
table=soup.find("table", id="taboa")
df=pd.read_html(str(table))
df.to_csv('testsot.csv')
print(df)

相关内容

最新更新

热门标签：