如何使用Python按页面中包含的特定唯一文本拆分PDF



我有一个很大的PDF文件,需要每隔"X"页拆分一次,但"X"的值并不固定。每个页面都包含文本"名称:",我需要在"名称:"之后的文本发生变化的页面处进行拆分……

因此,第1页可能有"名称:Sachin",第2页也可能有"名称:Sachin",但第3页有"名称:Sarah",因此应将第1页和第2页拆分为一个文件,然后从第3页开始拆分为另一个文件。

这是我找到的一个脚本,但它无论页面内容如何都会在每一页上进行拆分。

https://www.blog.pythonlibrary.org/2018/04/11/splitting-and-merging-pdfs-with-python/

提前感谢

Sachin

更新

以下是一些代码,它会在每一页上进行拆分,同时检测文本"Name:"(即"名称:")之后的名字,并用该名字重命名拆分出的文件。

我如何更新代码,以便在发现两个连续页面的名字相同(即文本字段"名称:"之后的内容相同)时,不在该页面处拆分,而是将这两个同名页面合并到一个PDF文件中?

再次感谢

Sachin

import os
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
# Split 'Payslips.pdf' into one PDF per page and name each output file after
# the name found after "Name:" in that page's extracted text.
pdf_file_path = 'Payslips.pdf'
file_base_name = pdf_file_path.replace('.pdf', '')
output_folder_path = os.path.join(os.getcwd(), 'Output')
# open() does not create missing directories, so make sure the target exists.
os.makedirs(output_folder_path, exist_ok=True)
pdf = PdfFileReader(pdf_file_path)

for page_num in range(pdf.numPages):
    # A fresh writer per page: every page ends up in its own output file.
    pdf_writer = PdfFileWriter()
    page_obj = pdf.getPage(page_num)
    pdf_writer.addPage(page_obj)

    # Take the shortest digit-free run after "Name:" that ends in whitespace.
    text = page_obj.extractText()
    matches = re.findall(r"Name:[^0-9]+?\s", text)
    if not matches:
        raise ValueError(
            'No "Name:" field found on page {0}'.format(page_num))
    matched_text = matches[0].replace('Name:', '').replace('\n', '')

    # Cut the run-together name at every upper-case letter. The appended 'A'
    # is a sentinel so the last chunk gets an end position; any text before
    # the first capital is dropped.
    capitals = [i for i, ch in enumerate(matched_text + 'A') if ch.isupper()]
    chunks = [matched_text[a:b] for a, b in zip(capitals, capitals[1:])]
    if len(chunks) < 2:
        raise ValueError(
            'Cannot split name {0!r} on page {1}'.format(matched_text, page_num))

    # Chunk 0 is discarded, chunk 1 is the first name and everything after
    # it is joined into the surname.
    firstname = chunks[1]
    surname = ''.join(chunks[2:])

    # NOTE: pages that yield the same name map to the same file name, so a
    # later page overwrites the file written for an earlier one.
    output_path = os.path.join(
        output_folder_path,
        '{0}, {1} - {2}.pdf'.format(
            surname.upper(), firstname.upper(), file_base_name.upper()))
    with open(output_path, 'wb') as f:
        pdf_writer.write(f)
    print("Split Page " + str(page_num))

这样的东西应该可以工作:

import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def pdf_splitter(path, page_key=None):
    """Split the PDF at *path* into one file per run of consecutive pages.

    Neighbouring pages for which ``page_key`` returns equal values are
    collected into the same output file; a new file is started whenever the
    value changes. Each output file is named ``<basename>_page_<n>.pdf``,
    where ``n`` is the 1-based number of the first page in the run, and is
    written relative to the current working directory.

    Args:
        path: Path of the PDF to split.
        page_key: Optional callable that receives a page object and returns
            the value to compare between neighbouring pages (for example a
            name extracted from the page text). When omitted, every page is
            treated as distinct, so each page gets its own file.
    """
    fname = os.path.splitext(os.path.basename(path))[0]
    pdf = PdfFileReader(path)

    def write_run(writer, filename):
        # Persist the pages collected so far and report the created file.
        with open(filename, 'wb') as out:
            writer.write(out)
        print('Created: {}'.format(filename))

    pdf_writer = None
    output_filename = None
    previous_key = None
    for page in range(pdf.getNumPages()):
        page_obj = pdf.getPage(page)
        # Without a key function the page index is used, which never repeats.
        key = page if page_key is None else page_key(page_obj)

        # Only write once the condition (same key as before) is no longer met.
        if pdf_writer is not None and key != previous_key:
            write_run(pdf_writer, output_filename)
            pdf_writer = None

        if pdf_writer is None:
            pdf_writer = PdfFileWriter()
            output_filename = '{}_page_{}.pdf'.format(fname, page + 1)

        pdf_writer.addPage(page_obj)
        previous_key = key

    # The final run has no following page to trigger the write.
    if pdf_writer is not None:
        write_run(pdf_writer, output_filename)
if __name__ == '__main__':
    # Run the splitter on the sample file only when executed as a script.
    path = 'w9.pdf'
    pdf_splitter(path)

好吧,我想我解决了它:

import os
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
pdf_file_path = 'Payslips.pdf'
file_base_name = pdf_file_path.replace('.pdf', '')
output_folder_path = os.path.join(os.getcwd(), 'Output')
# Fixed text placed around "SURNAME, FIRSTNAME" in every output file name.
output_name_prefix = 'WE 25JAN 2022 - '
output_name_suffix = ' - PAYSLIP'


def extract_name(text):
    """Return the text that follows "Name:" on a page, or None if absent.

    The match is the shortest digit-free run after "Name:" that ends in a
    whitespace character; newlines are removed from the result.
    """
    matches = re.findall(r"Name:[^0-9]+?\s", text)
    if not matches:
        return None
    return matches[0].replace('Name:', '').replace('\n', '')


def split_name(matched_text):
    """Split a run-together name on capitals into (firstname, surname).

    The text is cut at every upper-case letter. The first chunk is
    discarded, the second is the first name and the remaining chunks form
    the surname. Surname chunks are joined with a space unless the previous
    chunk ends in "-" or "'" (e.g. "Smith-" + "Jones", "O'" + "Brien").

    Raises:
        ValueError: if fewer than three capitalised chunks are found.
    """
    # The appended 'A' is a sentinel so the last chunk gets an end position.
    capitals = [i for i, ch in enumerate(matched_text + 'A') if ch.isupper()]
    chunks = [matched_text[a:b] for a, b in zip(capitals, capitals[1:])]
    if len(chunks) < 3:
        raise ValueError(
            'Cannot split {0!r} into first name and surname'.format(matched_text))

    firstname = chunks[1]
    surname_parts = chunks[2:]
    surname = surname_parts[0]
    for previous, part in zip(surname_parts, surname_parts[1:]):
        if previous[-1] in ("-", "'"):
            surname += part
        else:
            surname += " " + part
    return firstname, surname


def group_consecutive(names):
    """Group indices of equal neighbouring items.

    Returns a list of ``(name, [indices])`` tuples in input order; a new
    group starts whenever the name differs from the previous one, so a run
    of any length stays together.
    """
    groups = []
    for index, name in enumerate(names):
        if groups and groups[-1][0] == name:
            groups[-1][1].append(index)
        else:
            groups.append((name, [index]))
    return groups


def split_payslips(pdf_path, output_dir,
                   prefix=output_name_prefix, suffix=output_name_suffix):
    """Write one PDF per run of consecutive pages that share a name.

    Each file is named ``<prefix>SURNAME, FIRSTNAME<suffix>.pdf`` inside
    *output_dir*. The final name is written directly, so files already
    present in the output folder are left untouched.

    Raises:
        ValueError: if a page has no "Name:" field or the name cannot be
            split into first name and surname.
    """
    pdf = PdfFileReader(pdf_path)
    # open() does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    # Read each page's name once, then group neighbouring pages by it.
    pages = [pdf.getPage(page_num) for page_num in range(pdf.numPages)]
    names = []
    for page_num, page in enumerate(pages):
        name = extract_name(page.extractText())
        if name is None:
            raise ValueError(
                'No "Name:" field found on page {0}'.format(page_num))
        names.append(name)

    for name, page_indices in group_consecutive(names):
        pdf_writer = PdfFileWriter()
        for index in page_indices:
            pdf_writer.addPage(pages[index])

        firstname, surname = split_name(name)
        # NOTE: two separate runs with the same name map to the same file
        # name, so the later run overwrites the earlier one.
        output_path = os.path.join(
            output_dir,
            '{0}{1}, {2}{3}.pdf'.format(
                prefix, surname.upper(), firstname.upper(), suffix))
        with open(output_path, 'wb') as f:
            pdf_writer.write(f)


if __name__ == '__main__':
    split_payslips(pdf_file_path, output_folder_path)

相关内容

最新更新