将PDF文件转换为.txt-python3


from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
#converts pdf, returns its text content as a string
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
filepath = open(fname, 'rb')
for page in PDFPage.get_pages(filepath, pagenums):
interpreter.process_page(page)
filepath.close()
converter.close()
text = output.getvalue()
output.close
return text 
def convertMultiple(pdfDir, txtDir):
if pdfDir == "": pdfDir = os.getcwd() + "\" #if no pdfDir passed in 
for pdf in os.listdir(pdfDir): #iterate through pdfs in pdf directory
fileExtension = pdf.split(".")[-1]
if fileExtension == "pdf":
pdfFilename = pdfDir + pdf 
text = convert(pdfFilename) #get string of text content of pdf
textFilename = txtDir + pdf + ".txt"
textFile = open(textFilename, "w") #make text file
textFile.write(text) #write text to text file
#textFile.close
pdfDir = (r"FK_EPPS")
txtDir = (r"FK_txt")
convertMultiple(pdfDir, txtDir)

我尝试将多个名为FK_EPPS的pdf文件转换为txt文件,并将其写入名为FK_txt的不同文件夹中。但它说没有这样的文件或目录。我把文件夹正好放在那个路径上。我试图找到解决方案,但仍然存在错误。你能帮我为什么会发生这种事吗?

/usr/local/lib/python2.7/dist-packages/pdfminer/__init__.py:20: UserWarning: On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For more information see https://github.com/pdfminer/pdfminer.six/issues/194
warnings.warn('On January 1st, 2020, pdfminer.six will stop supporting Python 2. Please upgrade to Python 3. For '
Traceback (most recent call last):
File "/home/a1-re/Documents/pdftotext/1.py", line 44, in <module>
convertMultiple(pdfDir, txtDir)
File "/home/a1-re/Documents/pdftotext/1.py", line 36, in convertMultiple
text = convert(pdfFilename) #get string of text content of pdf
File "/home/a1-re/Documents/pdftotext/1.py", line 21, in convert
filepath = file(fname, 'rb')
IOError: [Errno 2] No such file or directory: 'pdf1831150030.pdf'

(您显示的回溯不可能是正确的。对于您的样本输入,错误应该在开始时包含FK_EPPS。(

您忘记了路径和文件名必须使用适用于操作系统的适当分隔符进行分隔。

如果您在convert函数开始时打印出fname的值,您可能会立即看到这一点。您对文本输出文件名也犯了同样的错误,但这将更难注意到,因为它不会产生错误,只会创建错误的文件名。

相关内容

  • 没有找到相关文章

最新更新