如何从JavaScript网站抓取数据



我正试图从这个动态JavaScript网站上抓取数据。由于页面是动态的,我使用Selenium从表格中提取数据。请问应该如何从动态表格中提取数据?以下是我的代码。

import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import lxml.html as LH
import requests
# Scrape the text of the 'frame_historicos' element from the historical-data
# page and save it to 'testsot.csv'.

# URL of the page to scrape.
urlpage = 'http://www.sotaventogalicia.com/en/real-time-data/historical'
print(urlpage)
# Start a Chrome session from a locally downloaded chromedriver binary.
driver = webdriver.Chrome('C:/Users/Shresth Suman/Downloads/chromedriver_win32/chromedriver.exe')
##driver = webdriver.Firefox(executable_path = 'C:/Users/Shresth Suman/Downloads/geckodriver-v0.26.0-win64/geckodriver.exe')

# One dict per matched element; filled inside the try block below.
data = []
try:
    # Load the page.
    driver.get(urlpage)
    # Scroll to the bottom of the page (the returned page height is not used).
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    # Fixed 5 s pause before querying the DOM.
    time.sleep(5)

    # Earlier XPath attempts, kept for reference:
    ##results = driver.find_elements_by_xpath("//div[@id='div_taboa']//table[@id='taboa']/tbody")
    ##results = driver.find_elements_by_xpath("//*[@id='page-title']")
    ##results = driver.find_elements_by_xpath("//*[@id='div_main']/h2[1]")
    results = driver.find_elements_by_xpath("//*[@id = 'frame_historicos']")
    print(results)
    print(len(results))

    for result in results:
        heading = result.text
        print(heading)
        # Use the plural lookup so a missing <h1> yields an empty list instead
        # of raising NoSuchElementException.
        # NOTE(review): if 'frame_historicos' is an <iframe>, its inner document
        # is not searchable from here without driver.switch_to.frame(...) - confirm.
        h1_matches = result.find_elements_by_tag_name('h1')
        # Store the text rather than the WebElement: the element handle is
        # useless once the driver has quit and would be written to the CSV as
        # an object repr.
        head_text = h1_matches[0].text if h1_matches else None
        data.append({"head": head_text, "name": heading})
finally:
    # Always close the browser, even when a step above fails.
    driver.quit()
###################################################################

# Collect the rows in a DataFrame, show it, and write it to CSV.
df = pd.DataFrame(data)
print(df)
df.to_csv('testsot.csv')

我想提取2005年至今的数据,平均值/总计(Averages/Totals)选项设为10分钟,但这样只能得到一个月的数据。

  1. 使用 WebDriverWait 配合 element_to_be_clickable() 等待元素可点击
  2. 安装 BeautifulSoup 库
  3. 使用 pandas 的 read_html()
  4. 我没有创建日期列表。您应该为2005年1月1日之后的所有月份创建开始日期和结束日期的列表,并逐月迭代(iterate)

    # Fetch one month of data from the historical-data widget and save the
    # resulting table to 'testsot.csv'.
    from io import StringIO
    import time

    import pandas as pd
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    urlpage = 'http://www.sotaventogalicia.com/en/real-time-data/historical'
    # Date range typed into the form; change these to fetch another month.
    start_date = "1/1/2005"
    end_date = "1/31/2005"

    driver = webdriver.Chrome('C:/Users/Shresth Suman/Downloads/chromedriver_win32/chromedriver.exe')
    try:
        driver.get(urlpage)
        # The form and the result table are located after switching into the
        # 'frame_historicos' frame.
        WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.ID, "frame_historicos")))

        # First date input: start date.
        inputstartdate = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "(//input[@class='dijitReset dijitInputInner'])[1]")))
        inputstartdate.clear()
        inputstartdate.send_keys(start_date)

        # Last date input: end date.
        inputenddate = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "(//input[@class='dijitReset dijitInputInner'])[last()]")))
        inputenddate.clear()
        inputenddate.send_keys(end_date)

        # Submit the form and wait until the result table is visible.
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//input[@class='form-submit'][@value='REFRESH']"))).click()
        WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#taboa")))
        # Extra fixed pause after the table appears.
        # NOTE(review): presumably gives the rows time to finish loading - confirm.
        time.sleep(3)
        page_html = driver.page_source
    finally:
        # Always close the browser, even when a wait times out.
        driver.quit()

    soup = BeautifulSoup(page_html, "html.parser")
    table = soup.find("table", id="taboa")
    # read_html returns a *list* of DataFrames, so take the first (only) one;
    # calling .to_csv on the list itself raises AttributeError. The markup is
    # wrapped in StringIO because newer pandas deprecates literal HTML strings.
    df = pd.read_html(StringIO(str(table)))[0]
    df.to_csv('testsot.csv')
    print(df)
    

最新更新