我创建了一个程序,该程序将在目录中包含的所有PDF文件中搜索单词或短语。如果在给定的PDF中找到短语,则包含该术语的页面将被提取并保存为新的PDF。
这个程序很慢。我需要处理1000多个PDF,所以使用多进程(multiprocessing)或concurrent.futures来加快速度会非常有帮助。然而,我似乎无法让它正常工作。
有没有一种直接的方法可以在下面的代码中启用多进程处理?
import PyPDF2
import re
import os
import glob
from pathlib import Path
# Collect the run parameters interactively.
# NOTE: String is passed to re.search() by the search loop, so it is
# interpreted as a regular-expression pattern, not as literal text.
String = input("Enter search string: ")
inputDir = Path(input("Enter path to directory containing PDFs to search: "))
outputDir = Path(input("Enter path to directory where you would like PDFs saved: "))
outputAppend = input("Text (including separator) to be appended to end of filenames (blank if none): ")

# Escape the directory part so that glob metacharacters in the path itself
# ("[", "]", "*", "?") are matched literally; only "*.pdf" is a wildcard.
inputDir_glob = glob.escape(str(inputDir)) + "/*.pdf"
# Sorted so the files are processed in a stable, predictable order.
PDFlist = sorted(glob.glob(inputDir_glob))

# exist_ok=True replaces the separate existence check, which could race with
# another process creating the directory between the check and the call.
os.makedirs(str(outputDir), exist_ok=True)
# Scan every PDF page by page and copy the pages whose extracted text matches
# the search pattern into a new PDF in the output directory.
# Compile once: the pattern is identical for every page of every file.
pattern = re.compile(String)
for filename in PDFlist:
    reader = PyPDF2.PdfFileReader(filename, strict=False)
    writer = PyPDF2.PdfFileWriter()
    matched = False

    for i in range(reader.getNumPages()):
        page = reader.getPage(i)
        if pattern.search(page.extractText()):
            # Page numbers are reported zero-based, as returned by range().
            print("File: " + filename + " | " + "Page: " + str(i))
            writer.addPage(page)
            matched = True

    # Write the result once per file, and only when something was found,
    # so PDFs without a match do not produce an output file.
    if matched:
        out_path = outputDir / (Path(filename).stem + outputAppend + ".pdf")
        # The context manager closes the file even if write() raises.
        with open(out_path, "wb") as outputStream:
            writer.write(outputStream)
我最终弄清楚了这一点,并认为应该分享出来,以防其他人遇到类似的问题。以下解决方案比原始代码(张贴在上面)快得多:
import PyPDF2
import re
import os
import glob
from pathlib import Path
import concurrent.futures
# Collect the run parameters interactively.
# NOTE: String is passed to re.search() inside process_file, so it is
# interpreted as a regular-expression pattern, not as literal text.
String = input("Enter search string: ")
# Directory containing the original PDFs:
inputDir = Path(input("Enter path to directory containing PDFs to search: "))
outputDir = Path(input("Enter path to directory where you would like PDFs saved: "))
outputAppend = input("Text (including separator) to be appended to end of filenames (blank if none): ")

# Escape the directory part so that glob metacharacters in the path itself
# ("[", "]", "*", "?") are matched literally; only "*.pdf" is a wildcard.
inputDir_glob = glob.escape(str(inputDir)) + "/*.pdf"
# Sorted so the files are submitted in a stable, predictable order.
PDFlist = sorted(glob.glob(inputDir_glob))

# exist_ok=True replaces the separate existence check, which could race with
# another process creating the directory between the check and the call.
os.makedirs(str(outputDir), exist_ok=True)
def process_file(filename):
    """Copy the pages of one PDF that match the search pattern to a new PDF.

    Reads the module-level ``String`` (regular-expression pattern),
    ``outputDir`` and ``outputAppend``. Each page's extracted text is
    searched; matching pages are printed and collected, and the collected
    pages are written to ``<outputDir>/<input stem><outputAppend>.pdf``.
    No output file is created when no page matches.

    Args:
        filename: Path of the PDF to search, as a string.

    Returns:
        None.
    """
    reader = PyPDF2.PdfFileReader(filename, strict=False)
    writer = PyPDF2.PdfFileWriter()
    # Compile once per file instead of looking the pattern up for every page.
    pattern = re.compile(String)
    matched = False

    for i in range(reader.getNumPages()):
        page = reader.getPage(i)
        if pattern.search(page.extractText()):
            # Page numbers are reported zero-based, as returned by range().
            print("File: " + filename + " | " + "Page: " + str(i))
            writer.addPage(page)
            matched = True

    if not matched:
        return

    out_path = outputDir / (Path(filename).stem + outputAppend + ".pdf")
    # Write once, after the scan; the context manager closes the file even
    # if write() raises.
    with open(out_path, "wb") as outputStream:
        writer.write(outputStream)
# Fan the files out over a small thread pool and wait for all of them.
# Each future's result is retrieved explicitly: an exception raised inside a
# worker is only re-raised when its result is requested, so without this a
# failing PDF would be dropped silently.
# NOTE(review): the per-page work is presumably CPU-bound Python code; a
# ProcessPoolExecutor may scale better, but would need the prompts moved
# under an `if __name__ == "__main__":` guard - confirm by measuring.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    pending = {executor.submit(process_file, name): name for name in PDFlist}
    for future in concurrent.futures.as_completed(pending):
        try:
            future.result()
        except Exception as exc:
            # Top-level boundary: report the failing file and keep going so
            # one malformed PDF does not abort the remaining work.
            print("File: " + pending[future] + " | " + "Error: " + repr(exc))