从pdf列表中提取特定页面并生成新的pdf



我一直在尝试从每个pdf中提取特定页面,然后合并所有提取的pdf。

我的pdf文档列表

我正在使用pdfrw这个库,但在提取页面时出现错误

from pdfrw import PdfReader, PdfWriter
import os
files = [f for f in os.listdir(
'.') if os.path.isfile(f) and f.endswith('.pdf')]
print(files)

for pdf in files:
pages = PdfReader(pdf).pages
parts = [(6, 7)]
for part in parts:
title = pdf.title().split('.')[0]
outdata = PdfWriter(f'{title}_{part[0]}_.pdf')
for pagenum in range(*part):
outdata.addpage(pages[pagenum-1])
outdata.write()

如果可能请帮忙

raise PdfParseError('Invalid PDF header: %s' %
pdfrw.errors.PdfParseError: Invalid PDF header: '<!doctype html>'

Manas,

实现需求的一种方法是使用API。例如,考虑以下代码片段,它将PDF从上传的文件中分离出来。

import os
import requests # pip install requests
# The authentication key (API Key).
# Get your own by registering at https://app.pdf.co
API_KEY = "*********************************"
# Base URL for PDF.co Web API requests
BASE_URL = "https://api.pdf.co/v1"
# Source PDF file
SourceFile = ".\sample.pdf"
# Comma-separated list of page numbers (or ranges) to process. Example: '1,3-5,7-'.
Pages = "1-2,3-"

def main(args = None):
uploadedFileUrl = uploadFile(SourceFile)
if (uploadedFileUrl != None):
splitPDF(uploadedFileUrl)

def splitPDF(uploadedFileUrl):
"""Split PDF using PDF.co Web API"""
# Prepare requests params as JSON
# See documentation: https://apidocs.pdf.co
parameters = {}
parameters["pages"] = Pages
parameters["url"] = uploadedFileUrl
# Prepare URL for 'Split PDF' API request
url = "{}/pdf/split".format(BASE_URL)
# Execute request and get response as JSON
response = requests.post(url, data=parameters, headers={ "x-api-key": API_KEY })
if (response.status_code == 200):
json = response.json()
if json["error"] == False:
# Download generated PNG files
part = 1
for resultFileUrl in json["urls"]:
# Download Result File
r = requests.get(resultFileUrl, stream=True)
localFileUrl = f"Page{part}.pdf"
if r.status_code == 200:
with open(localFileUrl, 'wb') as file:
for chunk in r:
file.write(chunk)
print(f"Result file saved as "{localFileUrl}" file.")
else:
print(f"Request error: {response.status_code} {response.reason}")
part = part + 1
else:
# Show service reported error
print(json["message"])
else:
print(f"Request error: {response.status_code} {response.reason}")

def uploadFile(fileName):
"""Uploads file to the cloud"""

# 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.
# Prepare URL for 'Get Presigned URL' API request
url = "{}/file/upload/get-presigned-url?contenttype=application/octet-stream&name={}".format(
BASE_URL, os.path.basename(fileName))

# Execute request and get response as JSON
response = requests.get(url, headers={ "x-api-key": API_KEY })
if (response.status_code == 200):
json = response.json()

if json["error"] == False:
# URL to use for file upload
uploadUrl = json["presignedUrl"]
# URL for future reference
uploadedFileUrl = json["url"]
# 2. UPLOAD FILE TO CLOUD.
with open(fileName, 'rb') as file:
requests.put(uploadUrl, data=file, headers={ "x-api-key": API_KEY, "content-type": "application/octet-stream" })
return uploadedFileUrl
else:
# Show service reported error
print(json["message"])    
else:
print(f"Request error: {response.status_code} {response.reason}")
return None

if __name__ == '__main__':
main()

现在,要合并PDF文件,您可以使用类似于下面的代码片段:

import os
import requests # pip install requests
# The authentication key (API Key).
# Get your own by registering at https://app.pdf.co
API_KEY = "**********************************"
# Base URL for PDF.co Web API requests
BASE_URL = "https://api.pdf.co/v1"
# Source PDF files
SourceFile_1 = ".\sample1.pdf"
SourceFile_2 = ".\sample2.pdf"
# Destination PDF file name
DestinationFile = ".\result.pdf"
def main(args = None):
UploadedFileUrl_1 = uploadFile(SourceFile_1)
UploadedFileUrl_2 = uploadFile(SourceFile_2)
if (UploadedFileUrl_1 != None and UploadedFileUrl_2!= None):
uploadedFileUrls = "{},{}".format(UploadedFileUrl_1, UploadedFileUrl_2)
mergeFiles(uploadedFileUrls, DestinationFile)
def mergeFiles(uploadedFileUrls, destinationFile):
"""Perform Merge using PDF.co Web API"""
# Prepare requests params as JSON
# See documentation: https://apidocs.pdf.co
parameters = {}
parameters["name"] = os.path.basename(destinationFile)
parameters["url"] = uploadedFileUrls
# Prepare URL for 'Merge PDF' API request
url = "{}/pdf/merge".format(BASE_URL)
# Execute request and get response as JSON
response = requests.post(url, data=parameters, headers={ "x-api-key": API_KEY })
if (response.status_code == 200):
json = response.json()
if json["error"] == False:
#  Get URL of result file
resultFileUrl = json["url"]            
# Download result file
r = requests.get(resultFileUrl, stream=True)
if (r.status_code == 200):
with open(destinationFile, 'wb') as file:
for chunk in r:
file.write(chunk)
print(f"Result file saved as "{destinationFile}" file.")
else:
print(f"Request error: {response.status_code} {response.reason}")
else:
# Show service reported error
print(json["message"])
else:
print(f"Request error: {response.status_code} {response.reason}")

def uploadFile(fileName):
"""Uploads file to the cloud"""

# 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.
# Prepare URL for 'Get Presigned URL' API request
url = "{}/file/upload/get-presigned-url?contenttype=application/octet-stream&name={}".format(
BASE_URL, os.path.basename(fileName))

# Execute request and get response as JSON
response = requests.get(url, headers={ "x-api-key": API_KEY })
if (response.status_code == 200):
json = response.json()

if json["error"] == False:
# URL to use for file upload
uploadUrl = json["presignedUrl"]
# URL for future reference
uploadedFileUrl = json["url"]
# 2. UPLOAD FILE TO CLOUD.
with open(fileName, 'rb') as file:
requests.put(uploadUrl, data=file, headers={ "x-api-key": API_KEY, "content-type": "application/octet-stream" })
return uploadedFileUrl
else:
# Show service reported error
print(json["message"])    
else:
print(f"Request error: {response.status_code} {response.reason}")
return None

if __name__ == '__main__':
main()

在这个例子中,我使用pdf。公司的API。更多信息请参考以下链接。

https://apidocs.pdf。有限公司/30-pdf-split https://apidocs.pdf.co/31-pdf-merge

谢谢!

最新更新