使用pandas-python将html表转换为csv

这是我的代码，它可以正常运行。

from io import StringIO

import pandas as pd

# Sample DataTables markup: one header row and one data row whose last cell
# embeds an <audio> element with two <source> children.
html_data = """<table id="example" class="table table-hover dataTable no-footer" role="grid" aria-describedby="example_info">
<thead>
<tr role="row"><th class="sorting_desc" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-sort="descending" aria-label="Start Date/Time: activate to sort column ascending">Start Date/Time</th><th class="sorting" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-label="End Date/Time: activate to sort column ascending">End Date/Time</th><th class="sorting" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-label="Caller Name: activate to sort column ascending">Caller Name</th><th class="sorting" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-label="Caller Number: activate to sort column ascending">Caller Number</th><th class="sorting" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-label="Callee: activate to sort column ascending">Callee</th><th class="sorting" tabindex="0" aria-controls="example" rowspan="1" colspan="1" aria-label="Used Mins.: activate to sort column ascending">Used Mins.</th><th class="text-center sorting_disabled" rowspan="1" colspan="1" aria-label="File">File</th></tr>
</thead>
<tbody>
<tr role="row" class="odd"><td class="sorting_1">2020-11-27 12:50:23</td><td>2020-11-27 12:51:04</td><td>ABC 3</td><td>7111</td><td>923333222</td><td>1</td><td class=" text-center"><audio controls="">
<source src="../record_files_out/3/2020/oc_1.wav.wav" type="audio/ogg">
<source src="../record_files_out/358/2020-11-27/oc_1934553_358.wav.wav" type="audio/mpeg">
Your browser does not support the audio element.
</audio></td></tr></tbody>
</table>
"""

# Newer pandas releases deprecate passing literal HTML text to read_html, so
# the markup is wrapped in a file-like object, which works across versions.
# read_html keeps only the text of each cell, so the "File" column holds the
# <audio> fallback message rather than the src attribute of <source>.
csv_text = pd.read_html(StringIO(html_data))[0].to_csv(index=False, header=True)
print(csv_text)

这是输出

2020-11-27 12:50:23,2020-11-27 12:51:04,ABC 3,7111,923333222,1,Your browser does not support the audio element.

但我想提取

../record_files_out/3/2020/oc_1.wav.wav

而不是这个

Your browser does not support the audio element.

我建议您查看此推荐选项：

# Importing the required modules  
import os 
import sys 
import pandas as pd 
from bs4 import BeautifulSoup 

path = 'html.html'


def _cell_value(cell):
    """Return the value to store for one table cell.

    A cell that embeds media (for example ``<audio><source src=...>``) yields
    the ``src`` of its first ``<source>`` element, so the CSV records the file
    location instead of the "browser does not support" fallback text. Any
    other cell yields its text content unchanged.
    """
    source = cell.find("source", src=True)
    if source is not None:
        return source["src"]
    return cell.get_text()


def _row_values(row):
    """Return the values of the ``<th>``/``<td>`` cells directly inside *row*."""
    # Restricting to real cells skips the whitespace text nodes that sit
    # between tags; this replaces the former bare ``except: continue``.
    return [_cell_value(cell) for cell in row.find_all(["th", "td"], recursive=False)]


# Parse inside a context manager so the file handle is always closed.
with open(path) as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

# Only the first table of the document is converted; look it up once.
rows = soup.find_all("table")[0].find_all("tr")

# The first row supplies the column names ...
header = rows[0]
list_header = _row_values(header)

# ... and every remaining row is a data record.
HTML_data = rows[1:]
data = [_row_values(element) for element in HTML_data]

# Storing the data into a pandas DataFrame
dataFrame = pd.DataFrame(data=data, columns=list_header)

# Converting the DataFrame into a CSV file
dataFrame.to_csv('Geeks.csv')

相关内容

最新更新

热门标签：