美汤错误:"openpyxl.utils.exceptions.illegalcharactererror"



我试图从本地保存在我的硬盘上的html文件提取文本。然后将它们粘贴到excel文件的每一行中。在Mac上执行此操作,完整代码如下:

# install/import all prerequisites first
# from cgitb import text
from openpyxl import Workbook, load_workbook
from bs4 import BeautifulSoup

# create a question that asks how many files you have
i = 1
n = int(input("How many files ? "))
# final_n = n - 1

# the list of files
files = []

# the list of files only has 1 file contained by default
# while loop will create multiple files in the list so that I don't have to do the tedious work
while i <= n:
files.append("folder/SplashBidNoticeAbstractUI (" + str(i) +").html")
i = i+1

# load an existing Libreoffice Calc file
wb = Workbook()
ws = wb.active
ws.title = "Data"

# add the titles on the first row, each column with the respective title
ws.append(["DatePublished", "Closing Date", "Category", "Procuring Entity", "Approved Budget for the Contract", "Name", "Delivery Period", "Reference Number", "Title", "Area of Delivery", "Solicitation Number", "Contact"])

# the actual magic.
# extract desired data from the html files and then
# paste in the active Libreoffice Calc file
for i in files:
with open(i, "r", errors="ignore") as html_file:
content = html_file.read()  # does something
soup = BeautifulSoup(content, "html.parser")  # does something
# extracts data from the webpages

if soup.find("span", id="lblDisplayDatePublish") != None:
datePublished = soup.find("span", id="lblDisplayDatePublish").text
else: datePublished = ""
if soup.find("span", id="lblDisplayCloseDateTime") != None:
cd  = soup.find("span", id="lblDisplayCloseDateTime").text
else: cd = ""
if soup.find("span", id="lblDisplayCategory") != None:
cat = soup.find("span", id="lblDisplayCategory").text
else: cat = ""
if soup.find("span", id="lblDisplayProcuringEntity") != None:
pro_id = soup.find("span", id="lblDisplayProcuringEntity").text.replace("", "")
else: pro_id = ""
if soup.find("span", id="lblDisplayBudget") != None:
abc = soup.find("span", id="lblDisplayBudget").text
else: abc = ""
if soup.find("span", id="lblHeader") != None:
name = soup.find("span", id="lblHeader").text.replace(" ", "_").replace("n", "_")
else: name = ""
if soup.find("span", id="lblDisplayPeriod") != None:
delp = soup.find("span", id="lblDisplayPeriod").text
else: delp = ""
if soup.find("span", id="lblDisplayReferenceNo")!= None:
ref_num = soup.find("span", id="lblDisplayReferenceNo").text
else: ref_num = ""
if soup.find("span", id="lblDisplayTitle")!= None:
title = soup.find("span", id="lblDisplayTitle").text.replace(" ", "_").replace("n", "_")
else: title = ""
if soup.find("span", id="lblDisplayAOD") != None:
aod = soup.find("span", id="lblDisplayAOD").text.replace("n", "_")
else: aod = ""
if soup.find("span", id="lblDisplaySolNumber") != None:
solNr = soup.find("span", id="lblDisplaySolNumber").text
else: solNr = ""
if soup.find("span", id="lblDisplayContactPerson")!= None:
contact = soup.find("span", id="lblDisplayContactPerson").text
else: contact = ""
# just an assurance that the code worked and nothing screwed up
print("nBid" + i)
print("Date Published: " + datePublished)
print("Closing Date: " + cd)
print("Category : " + cat)
print("Procurement Entity : " + pro_id)
print("Name: " + name)
print("Delivery Period: " + delp)
print("ABC: " + abc)
print("Reference Number : " + ref_num)
print("Title : " + title)
print("Area of Delivery : " + aod)
print("Solicitation Number: "+ solNr)
print("Contact: "+ contact)
# pastes the data inside the calc file under the titles
ws.append([datePublished, cd, cat, pro_id, abc, name, delp, ref_num, title, aod, solNr, contact])
# saves file so work is safe and sound
filename = input("filename: ") 
wb.save(filename + ".xlsx")
print("Saved into '" + filename + ".xlsx'.") 

为所有37,820运行此代码html文件。

我已经尝试更新我的Python版本从3.93.103.11。我试过运行python3 phase3.pypython phase3.py。我还重新安装了bs4openpyxl。不过,问题还没有解决。下面是错误

openpyxl.utils.exceptions.illegalcharactererror

Openpyxl模块在您尝试将ASCII控制字符(例如,"x00","x01", ..)分配给单元格值时引发此exception。这意味着至少有一个html文件包含这种字符。因此,您需要使用str.encode来转义这些

替换为

ws.append([datePublished, cd, cat, pro_id, abc, name, delp, ref_num, title, aod, solNr, contact])

By this:

try:
list_of_vals= [datePublished, cd, cat, pro_id, abc, name, delp, ref_num, title, aod, solNr, contact]
ws.append(list_of_valls)
except openpyxl.utils.exceptions.illegalcharactererror:
ws.append([cell_val.encode("ascii") for cell_val in list_of_vals]

注意这里的缩进

最新更新