如何减轻在抓取数据时不规则发生的编码错误

我通过在该链接中运行"MDA Extractor.py"来抓取SEC EDGAR数据。https://github.com/rflugum/10K-MDA-Section

由于这个程序是在Python2中制作的，我更改了一些表达式(例如，print->print((，xrange->range(，并添加了useragent以避免阻塞。

同时，当它读取链接编号"39126"时(https://www.sec.gov/Archives/edgar/data/30302/0000030302-02-000003.txt)，

生成以下错误：

['39126', 'edgar/data/30302/0000030302-02-000003.txt']
Traceback (most recent call last):
File "MDAExtractor.py", line 261, in (module) headerclean(temp, temp1) 
File "MDAExtractor.py", line 112, in headerclean **for** x, line in enumerate(hand):
File "/usr/lib/python3.10/codecs.py", line 322, in decode (result, consumed) = self._buffer_decode(data, self.**errors**, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x97 in position 467: invalid start byte

我很困惑，因为在阅读链接39126之前，它运行得很好。我试着加上"；编码＝"utf-16"；到一些潜在的地方，但它产生了错误。我想知道如何处理这一问题，并制定一个更广泛适用的代码。

与错误相关的行如下：


#################################################################################
#This is the file that records the number of sections for each respective filing.
#################################################################################
LOG=os.path.join(filepath,"DOWNLOADLOG.txt")
with open(LOG,'w') as f:
f.write("FilertSECTIONSn")
f.close()
######## Download the filing ############
headers = {'User-Agent': 'A B@C.D'}
with open(download, 'r') as txtfile:
reader = csv.reader(txtfile, delimiter=',')
for line in reader:
print(line)
FileNUM=line[0].strip()
Filer=os.path.join(filepath, "MDA_processed/"+str(line[0].strip())+".txt")
url = 'https://www.sec.gov/Archives/' + line[1].strip()
with open(temp, 'wb') as f:
f.write(requests.get('%s' % url, headers = headers).content)
f.close()

##### Obtain Header Information on Filing ######################        

parse(temp, Filer)
headerclean(temp, temp1)   ####### LINE 261 !!!!!!!#####

##### ASCII Section ######################        

with open(temp,'r') as f:
str1=f.read()
output=str1
locations_xbrlbig=xbrl_clean("<type>zip", "</document>", output)
locations_xbrlbig.append(len(output))

if locations_xbrlbig!=[]:
str1=""
if len(locations_xbrlbig)%2==0:
for i in range(0,len(locations_xbrlbig),2):
str1=str1+output[locations_xbrlbig[i]:locations_xbrlbig[i+1]]
f.close
output=str1
locations_xbrlbig=xbrl_clean("<type>graphic", "</document>", output)
locations_xbrlbig.append(len(output))

if locations_xbrlbig!=[0]:
str1=""
if len(locations_xbrlbig)%2==0:
for i in range(0,len(locations_xbrlbig),2):
str1=str1+output[locations_xbrlbig[i]:locations_xbrlbig[i+1]]

output=str1
locations_xbrlbig=xbrl_clean("<type>excel", "</document>", output)
locations_xbrlbig.append(len(output))

if locations_xbrlbig!=[0]:
str1=""
if len(locations_xbrlbig)%2==0:
for i in range(0,len(locations_xbrlbig),2):
str1=str1+output[locations_xbrlbig[i]:locations_xbrlbig[i+1]]

output=str1
locations_xbrlbig=xbrl_clean("<type>pdf", "</document>", output)
locations_xbrlbig.append(len(output))

if locations_xbrlbig!=[0]:
str1=""
if len(locations_xbrlbig)%2==0:
for i in range(0,len(locations_xbrlbig),2):
str1=str1+output[locations_xbrlbig[i]:locations_xbrlbig[i+1]]

output=str1
locations_xbrlbig=xbrl_clean("<type>xml", "</document>", output)
locations_xbrlbig.append(len(output))

if locations_xbrlbig!=[0]:
str1=""
if len(locations_xbrlbig)%2==0:
for i in range(0,len(locations_xbrlbig),2):
str1=str1+output[locations_xbrlbig[i]:locations_xbrlbig[i+1]]
output=str1
locations_xbrlbig=xbrl_clean("<type>ex", "</document>", output)
locations_xbrlbig.append(len(output))

if locations_xbrlbig!=[0]:
str1=""
if len(locations_xbrlbig)%2==0:
for i in range(0,len(locations_xbrlbig),2):
str1=str1+output[locations_xbrlbig[i]:locations_xbrlbig[i+1]]


###########################  DELETE HEADER INFORMATION  #######################################
def headerclean(temp, temp1):
mark0=0
strings1=['</SEC-HEADER>','</IMS-HEADER>']
hand=open(temp)
hand.seek(0)
for x, line in enumerate(hand):
line=line.strip()
if any(s in line for s in strings1):
mark0=x
break
hand.seek(0)

newfile=open(temp1,'w')
for x, line in enumerate(hand):   ###### LINE 112 !!!!!##########
if x>mark0:
newfile.write(line)
hand.close()
newfile.close()

newfile=open(temp1,'r')
hand=open(temp,'w')        
for line in newfile:
if "END PRIVACY-ENHANCED MESSAGE" not in line:
hand.write(line)                
hand.close()                
newfile.close()

from bs4 import BeautifulSoup
headers = {'user-agent': '`A B@C.D`'}
with open(download, 'r') as txtfile:
reader = csv.reader(txtfile, delimiter=',')
for line in reader:
print(line)
FileNUM = line[0].strip()
Filer = os.path.join(filepath,"MDA_processed/"+str(line[0].strip()) + ".txt")
url = 'https://www.sec.gov/Archives/' + line[1].strip()
response = requests.get('%s' % url, headers=headers)
soup = BeautifulSoup(response.content, 'lxml')  
filing_document = soup.body.text.encode('utf-8').decode('ascii', 'ignore')
#print(filing_document)

with open(temp, 'wb') as f:
f.write(filing_document)
f.close()

相关内容

最新更新

热门标签：