尝试解析Word文档时遇到PdfReadError:EOF marker not found(找不到EOF标记)



我正在测试一些Python代码,以循环浏览简历,打开每个简历,解析每个简历,并根据每个简历的内容创建一个全面的报告。这是我正在运行的代码。

#importing all required libraries
import PyPDF2
import os
from os import listdir
from os.path import isfile, join
from io import StringIO
import pandas as pd
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.matcher import PhraseMatcher
#Function to read resumes from the folder one by one
# Folder that holds the resumes -- enter your own path here.
# Written as a raw string without a trailing backslash: in
# 'C:\path_to_resumes\' the final \' escapes the closing quote, which is a
# SyntaxError. os.path.join below inserts the separator itself.
mypath = r'C:\path_to_resumes'
# Full paths of the regular files directly inside mypath; sub-folders are skipped.
onlyfiles = [
    os.path.join(mypath, entry)
    for entry in os.listdir(mypath)
    if os.path.isfile(os.path.join(mypath, entry))
]
def pdfextract(file):
    """Return the extracted text of every page of a PDF file.

    Args:
        file: Path of the PDF file to read.

    Returns:
        A list with one extracted-text string per page, in page order.

    Raises:
        PyPDF2's PdfReadError ("EOF marker not found") when ``file`` is not
        a PDF, for example a .doc/.docx document.
    """
    # The context manager closes the file handle even when parsing fails;
    # all pages are extracted while the stream is still open.
    with open(file, 'rb') as stream:
        fileReader = PyPDF2.PdfFileReader(stream)
        text = []
        for count in range(fileReader.getNumPages()):
            t = fileReader.getPage(count).extractText()
            print(t)
            text.append(t)
    return text
#function to read resume ends

#function that does phrase matching and builds a candidate profile
def create_profile(file, keyword_csv='D:/NLP_Resume/resume/template_new.csv'):
    """Build the keyword profile of one resume.

    The resume text is lower-cased and matched against the phrases listed in
    the keyword CSV; every distinct (category, phrase) hit becomes one row.

    Args:
        file: Path of the resume; it is read with ``pdfextract``. The
            candidate name is the part of the file name before the first
            underscore, lower-cased.
        keyword_csv: Path of the CSV that lists the phrases to look for. It
            must contain the columns 'Statistics', 'NLP', 'Machine Learning',
            'Deep Learning', 'R Language', 'Python Language' and
            'Data Engineering'. Defaults to the path used originally.

    Returns:
        A DataFrame with the columns 'Candidate Name', 'Subject', 'Keyword'
        and 'Count' (the count is kept as a string, as before). The frame has
        no rows when nothing matched.
    """
    # Join the page strings directly. The previous str(list) round-trip kept
    # the list brackets and quotes in the text and turned real newlines into
    # the two characters backslash + n, so replace("\n", "") removed nothing.
    pages = pdfextract(file)
    text = " ".join(pages).replace("\n", "").lower()

    # The CSV holds the keywords per category; customize it as needed.
    keyword_dict = pd.read_csv(keyword_csv)
    # (matcher label, CSV column) for every keyword category.
    categories = [
        ('Stats', 'Statistics'),
        ('NLP', 'NLP'),
        ('ML', 'Machine Learning'),
        ('DL', 'Deep Learning'),
        ('R', 'R Language'),
        ('Python', 'Python Language'),
        ('DE', 'Data Engineering'),
    ]
    matcher = PhraseMatcher(nlp.vocab)
    for label, column in categories:
        phrases = [nlp(phrase) for phrase in keyword_dict[column].dropna(axis=0)]
        matcher.add(label, None, *phrases)

    doc = nlp(text)
    # Map each (category label, matched text) pair to its number of occurrences.
    hits = Counter(
        (nlp.vocab.strings[match_id], doc[start:end].text)
        for match_id, start, end in matcher(doc)
    )

    base = os.path.basename(file)
    candidate = os.path.splitext(base)[0].split('_')[0].lower()

    # Build the table straight from the counts. Formatting the hits as text
    # and re-parsing them with read_csv / str.split broke on keywords or
    # names containing ',' or '(' and raised when there was no match at all.
    rows = [
        (candidate, subject, keyword, str(count))
        for (subject, keyword), count in hits.items()
    ]
    return pd.DataFrame(rows, columns=['Candidate Name', 'Subject', 'Keyword', 'Count'])
#function ends
# Build one profile per resume and stack them into a single table.
# The profiles are collected in a list and concatenated once:
# DataFrame.append copies the whole frame on every call and was removed in
# pandas 2.0. Row indices are kept as they are, exactly as append did.
profiles = [create_profile(resume_path) for resume_path in onlyfiles]
# pd.concat rejects an empty list, so fall back to an empty frame.
final_database = pd.concat(profiles) if profiles else pd.DataFrame()
print(final_database)

# Count the matched keywords per candidate and category, then draw the
# result as a stacked horizontal bar chart with Matplotlib.
keyword_column = final_database['Keyword']
group_keys = [final_database['Candidate Name'], final_database['Subject']]
final_database2 = keyword_column.groupby(group_keys).count().unstack()
final_database2.reset_index(inplace = True)
final_database2.fillna(0,inplace=True)
# Everything after the first column (the candidate name) is chart data;
# the candidate name itself becomes the row index.
new_data = final_database2.iloc[:,1:]
new_data.index = final_database2['Candidate Name']
# Uncomment the line below to save the candidate profile as a CSV file.
#sample2=new_data.to_csv('sample.csv')
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})
ax = new_data.plot.barh(title="Resume keywords by category", legend=False, figsize=(25,7), stacked=True)
# Labels are built column by column and, within a column, row by row;
# they are paired with ax.patches in that same order below.
labels = [
    str(subject) + ": " + str(new_data.loc[candidate][subject])
    for subject in new_data.columns
    for candidate in new_data.index
]
patches = ax.patches
for label, rect in zip(labels, patches):
    width = rect.get_width()
    if not width > 0:
        continue  # zero-width segment: no label is written
    center_x = rect.get_x() + width/2.
    center_y = rect.get_y() + rect.get_height()/2.
    ax.text(center_x, center_y, label, ha='center', va='center')
plt.show()

文件夹中存放的是".doc"和".docx"文件。在执行到下面这段代码之前,一切似乎都正常;运行到这里时,代码抛出了错误。下面就是出问题的代码。奇怪的是,它看起来像是某种PDF错误,但我遍历的只有".doc"和".docx"文件。

# Build one profile per resume and stack them into a single table.
# The profiles are collected in a list and concatenated once:
# DataFrame.append copies the whole frame on every call and was removed in
# pandas 2.0. Row indices are kept as they are, exactly as append did.
profiles = [create_profile(resume_path) for resume_path in onlyfiles]
# pd.concat rejects an empty list, so fall back to an empty frame.
final_database = pd.concat(profiles) if profiles else pd.DataFrame()
print(final_database)

这是StackTrace:

Traceback (most recent call last):
File "<ipython-input-2-c63fca79d39f>", line 5, in <module>
dat = create_profile(file)
File "<ipython-input-1-cdc3bf75cd26>", line 34, in create_profile
text = pdfextract(file)
File "<ipython-input-1-cdc3bf75cd26>", line 17, in pdfextract
fileReader = PyPDF2.PdfFileReader(open(file,'rb'))
File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1084, in __init__
self.read(stream)
File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1696, in read
raise utils.PdfReadError("EOF marker not found")
PdfReadError: EOF marker not found

代码来自这里。

https://towardsdatascience.com/do-the-keywords-in-your-resume-aptly-represent-what-type-of-data-scientist-you-are-59134105ba0d

您使用的是PyPDF2包,它是用来读取和操作PDF文件的。在您提到的那篇towardsdatascience文章中,作者处理的所有简历都是PDF格式的。

如果您的简历是doc/docx格式,或许应该试试python-docx库:https://python-docx.readthedocs.io/en/latest/index.html

最新更新