我在GitHub上保存了几个带有表格格式数据的HTML文件,现在我想将数据读取到一个EXCEL工作簿中。
由于pd.ExcelWriter('output.xlsx', mode='a')
显然已被弃用,我现在尝试使用openpyxl
。
这是我的代码:
# import packages
import pandas as pd
from urllib.parse import urljoin
from openpyxl import load_workbook
# define EXCEL file for output data
book = load_workbook("C:\Users\###\Downloads\VMA2013_raw-data.xlsx")
writer = pd.ExcelWriter("C:\Users\###\Downloads\VMA2013_raw-data.xlsx", engine='openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
# files with WBRA data on Github
filenames= ("WebDataRA-2022-09-21T11_52_44.html",
"WebDataRA-2022-09-21T11_54_00.html",
"WebDataRA-2022-09-21T11_55_33.html",
"WebDataRA-2022-09-21T11_59_16.html",
"WebDataRA-2022-09-21T12_07_45.html",
"WebDataRA-2022-09-21T12_08_42.html",
"WebDataRA-2022-09-21T12_10_20.html",
"WebDataRA-2022-09-21T12_15_34.html",
"WebDataRA-2022-09-21T12_16_50.html",
"WebDataRA-2022-09-21T12_17_43.html",
"WebDataRA-2022-09-21T12_18_29.html",
"WebDataRA-2022-09-21T12_19_25.html",
"WebDataRA-2022-09-21T12_20_53.html",
"WebDataRA-2022-09-21T12_21_46.html",
"WebDataRA-2022-09-21T12_23_01.html",
"WebDataRA-2022-09-21T12_24_06.html",
"WebDataRA-2022-09-21T12_25_53.html",
"WebDataRA-2022-09-21T12_26_53.html",
"WebDataRA-2022-09-21T12_27_19.html",
"WebDataRA-2022-09-21T12_28_28.html",
"WebDataRA-2022-09-21T12_28_50.html",
"WebDataRA-2022-09-21T12_30_41.html",
"WebDataRA-2022-09-21T12_31_38.html",
"WebDataRA-2022-09-21T12_32_47.html",
"WebDataRA-2022-09-21T12_33_18.html",
"WebDataRA-2022-09-21T12_34_00.html",
"WebDataRA-2022-09-21T12_34_20.html",
"WebDataRA-2022-09-21T12_34_49.html",
"WebDataRA-2022-09-21T12_35_04.html",
"WebDataRA-2022-09-21T12_36_33.html",
"WebDataRA-2022-09-21T12_37_04.html",
"WebDataRA-2022-09-21T12_37_30.html",
"WebDataRA-2022-09-21T12_38_54.html",
"WebDataRA-2022-09-21T12_39_10.html",
"WebDataRA-2022-09-21T12_40_14.html",
"WebDataRA-2022-09-21T12_40_42.html",
"WebDataRA-2022-09-21T12_41_56.html",
"WebDataRA-2022-09-21T12_42_52.html",
"WebDataRA-2022-09-21T12_44_05.html",
"WebDataRA-2022-09-21T12_45_28.html",
"WebDataRA-backup-1000.html")
# Create URL for each file whose table we want to extract
path="https://raw.githubusercontent.com/####/*****/main/VMA2013_tweets/"
for f in filenames:
url=urljoin(path, f)
# Assign the table data to a Pandas dataframe
table = pd.read_html(url)[0]
# Store the dataframe in Excel file
startrow = writer.sheets['VMA2013'].max_row
table.to_excel(writer, startrow,index = False)
print("done")
我收到的错误通知是:
BadZipFile Traceback (most recent call last)
~AppDataLocalTempipykernel_3488720253268.py in <module>
6 # define EXCEL file for output data
7
----> 8 book = load_workbook("C:\Users\###\Downloads\VMA2013_raw-data.xlsx")
9 writer = pandas.ExcelWriter("C:\Users\###\Downloads\VMA2013_raw-data.xlsx", engine='openpyxl')
10 writer.book = book
C:ProgramDataAnaconda3libsite-packagesopenpyxlreaderexcel.py in load_workbook(filename, read_only, keep_vba, data_only, keep_links)
314 """
315 reader = ExcelReader(filename, read_only, keep_vba,
--> 316 data_only, keep_links)
317 reader.read()
318 return reader.wb
C:ProgramDataAnaconda3libsite-packagesopenpyxlreaderexcel.py in __init__(self, fn, read_only, keep_vba, data_only, keep_links)
122 def __init__(self, fn, read_only=False, keep_vba=KEEP_VBA,
123 data_only=False, keep_links=True):
--> 124 self.archive = _validate_archive(fn)
125 self.valid_files = self.archive.namelist()
126 self.read_only = read_only
C:ProgramDataAnaconda3libsite-packagesopenpyxlreaderexcel.py in _validate_archive(filename)
94 raise InvalidFileException(msg)
95
---> 96 archive = ZipFile(filename, 'r')
97 return archive
98
C:ProgramDataAnaconda3libzipfile.py in __init__(self, file, mode, compression, allowZip64, compresslevel)
1256 try:
1257 if mode == 'r':
-> 1258 self._RealGetContents()
1259 elif mode in ('w', 'x'):
1260 # set the modified flag so central directory gets written
C:ProgramDataAnaconda3libzipfile.py in _RealGetContents(self)
1323 raise BadZipFile("File is not a zip file")
1324 if not endrec:
-> 1325 raise BadZipFile("File is not a zip file")
1326 if self.debug > 1:
1327 print(endrec)
BadZipFile: File is not a zip file
我搞不懂。为什么需要一个zip文件?EXCEL工作簿已按指定创建在我的下载文件夹中,但当我尝试打开它时,文件已损坏。
当我事先创建所需的文件时,错误通知如下:
TypeError Traceback (most recent call last)
~AppDataLocalTempipykernel_34881843287704.py in <module>
70 startrow = writer.sheets['VMA2013'].max_row
71
---> 72 table.to_excel(writer, startrow,index = False)
73
74 print("done")
C:ProgramDataAnaconda3libsite-packagespandascoregeneric.py in to_excel(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, encoding, inf_rep, verbose, freeze_panes, storage_options)
2289 freeze_panes=freeze_panes,
2290 engine=engine,
-> 2291 storage_options=storage_options,
2292 )
2293
C:ProgramDataAnaconda3libsite-packagespandasioformatsexcel.py in write(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options)
843 startrow=startrow,
844 startcol=startcol,
--> 845 freeze_panes=freeze_panes,
846 )
847 finally:
C:ProgramDataAnaconda3libsite-packagespandasioexcel_openpyxl.py in write_cells(self, cells, sheet_name, startrow, startcol, freeze_panes)
447 else:
448 wks = self.book.create_sheet()
--> 449 wks.title = sheet_name
450 self.sheets[sheet_name] = wks
451
C:ProgramDataAnaconda3libsite-packagesopenpyxlworkbookchild.py in title(self, value)
88 raise ValueError("Worksheet titles must be str")
89
---> 90 m = INVALID_TITLE_REGEX.search(value)
91 if m:
92 msg = "Invalid character {0} found in sheet title".format(m.group(0))
TypeError: expected string or bytes-like object
由于我无法让openpyxl
工作,我现在正在将HTML数据写入单独的EXCEL文件中,之后我将合并这些文件:
# import packages
import pandas as pd
from urllib.parse import urljoin
import os
from openpyxl import load_workbook
# define EXCEL file for output data
outpath = "C:\Users\###\Downloads\"
# files with WBRA data on Github
filenames= ("WebDataRA-2022-09-21T11_52_44.html",
"WebDataRA-2022-09-21T11_54_00.html",
"WebDataRA-2022-09-21T11_55_33.html",
"WebDataRA-2022-09-21T11_59_16.html",
"WebDataRA-2022-09-21T12_07_45.html",
"WebDataRA-2022-09-21T12_08_42.html",
"WebDataRA-2022-09-21T12_10_20.html",
"WebDataRA-2022-09-21T12_15_34.html",
"WebDataRA-2022-09-21T12_16_50.html",
"WebDataRA-2022-09-21T12_17_43.html",
"WebDataRA-2022-09-21T12_18_29.html",
"WebDataRA-2022-09-21T12_19_25.html",
"WebDataRA-2022-09-21T12_20_53.html",
"WebDataRA-2022-09-21T12_21_46.html",
"WebDataRA-2022-09-21T12_23_01.html",
"WebDataRA-2022-09-21T12_24_06.html",
"WebDataRA-2022-09-21T12_25_53.html",
"WebDataRA-2022-09-21T12_26_53.html",
"WebDataRA-2022-09-21T12_27_19.html",
"WebDataRA-2022-09-21T12_28_28.html",
"WebDataRA-2022-09-21T12_28_50.html",
"WebDataRA-2022-09-21T12_30_41.html",
"WebDataRA-2022-09-21T12_31_38.html",
"WebDataRA-2022-09-21T12_32_47.html",
"WebDataRA-2022-09-21T12_33_18.html",
"WebDataRA-2022-09-21T12_34_00.html",
"WebDataRA-2022-09-21T12_34_20.html",
"WebDataRA-2022-09-21T12_34_49.html",
"WebDataRA-2022-09-21T12_35_04.html",
"WebDataRA-2022-09-21T12_36_33.html",
"WebDataRA-2022-09-21T12_37_04.html",
"WebDataRA-2022-09-21T12_37_30.html",
"WebDataRA-2022-09-21T12_38_54.html",
"WebDataRA-2022-09-21T12_39_10.html",
"WebDataRA-2022-09-21T12_40_14.html",
"WebDataRA-2022-09-21T12_40_42.html",
"WebDataRA-2022-09-21T12_41_56.html",
"WebDataRA-2022-09-21T12_42_52.html",
"WebDataRA-2022-09-21T12_44_05.html",
"WebDataRA-2022-09-21T12_45_28.html",
"WebDataRA-backup-1000.html")
# Create webpage URL for each file whose table we want to extract
path="https://raw.githubusercontent.com/###/***/main/VMA2013_tweets/"
counter=0
for f in filenames:
url=urljoin(path, f)
print(url)
counter+= 1
# Assign the table data to a Pandas dataframe
table = pd.read_html(url)[0]
outfile=os.path.join(outpath, str(counter) + ".xlsx")
print(outfile)
# Store the dataframe in NEW Excel file
table.to_excel(outfile)
print("done")