我正在开发一个从动态网站中提取html表的脚本。下面是我的脚本:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup  # was missing: BeautifulSoup is used below
import pandas as pd            # was missing: pd is used below
import time
import sys

# Launch Chrome and load the (dynamic) page.
driver = webdriver.Chrome()
driver.implicitly_wait(20)
URL = 'https://www.ccee.org.br/portal/faces/pages_publico/o-que-fazemos/como_ccee_atua/precos/precos_medios?_adf.ctrl-state=7e1fw5zdn_14&_afrLoop=19197915280379#!%40%40%3F_afrLoop%3D19197915280379%26_adf.ctrl-state%3D7e1fw5zdn_18'
driver.get(URL)
time.sleep(50)  # crude fixed wait for the JS-rendered content to appear

# Parse the rendered page and collect every <td> cell, row by row.
soup = BeautifulSoup(driver.page_source, "html.parser")
table = soup.find('html')
list_of_rows = []
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll(["td"]):
        list_of_cells.append(cell.text)
    list_of_rows.append(list_of_cells)

# Export the scraped rows to CSV.
Data = pd.DataFrame(list_of_rows)
Data.to_csv('Data.csv', index=False)
driver.quit()
我已经使用Selenium来提取,但未能获得URL网页中的表。当我运行这个脚本时,我得到如下表:
0 1 2 3 4 5
0
1 None None None
2 None None None
3 OK OK None
4 OK None None None None
5 None
6 None None None None
7 None None None
8 OKCancel OKCancel None
9 OKCancel None None None None
我已经修改了您的代码,现在可以正确导出表了。
- 主要问题:您的表位于 iframe 中,在与页面进行任何进一步交互之前,您需要先切换(switch)到该 iframe。
- BeautifulSoup 取出的 cell.text 包含 "\n"、"\t" 等字符,需要使用 regex 将其删除。
- 更多说明见代码内注释,如有问题请告诉我。
解决方案:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import re

driver = webdriver.Chrome()
URL = 'https://www.ccee.org.br/portal/faces/pages_publico/o-que-fazemos/como_ccee_atua/precos/precos_medios?_adf.ctrl-state=7e1fw5zdn_14&_afrLoop=19197915280379#!%40%40%3F_afrLoop%3D19197915280379%26_adf.ctrl-state%3D7e1fw5zdn_18'
driver.get(URL)

# The table lives inside an iframe: wait for it to be available and switch
# into it in one step (driver.switch_to.frame returns None, so there is no
# useful value to assign).
WebDriverWait(driver, 60).until(
    EC.frame_to_be_available_and_switch_to_it((By.ID, 'pt1:myFrame')))
# Wait for the table header to render inside the iframe.
WebDriverWait(driver, 60).until(
    EC.visibility_of_element_located((By.XPATH, "//table//thead/tr/th")))

soup = BeautifulSoup(driver.page_source, "html.parser")
table = soup.find('html')
list_of_rows = []
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll(["td"]):
        # Strip newline/tab runs. NOTE: the original pattern r'nt+' matched
        # the *literal* characters "n"/"t", corrupting the cell text; the
        # character class below removes the actual "\n" and "\t".
        text = re.sub(r'[\n\t]+', '', cell.text)
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)

data = pd.DataFrame(list_of_rows)
data.dropna(axis=0, how='any', inplace=True)  # drop incomplete/empty rows
header = ['Mes', 'SE/CO', 'S', 'NE', 'N']     # Month + the four submarkets
data.to_csv('Datax.csv', header=header, index=False)
driver.quit()