我一直在尝试从每个pdf中提取特定页面,然后合并所有提取的pdf。
我的pdf文档列表
我正在使用 pdfrw 这个库,但在提取页面时出现了错误:
import os

from pdfrw import PdfReader, PdfWriter
from pdfrw.errors import PdfParseError

# Page ranges to pull out of every document, written as 1-based
# (first, stop) pairs with an exclusive stop: (6, 7) selects page 6 only.
parts = [(6, 7)]

# Every regular file in the current directory whose name ends in ".pdf".
files = [f for f in os.listdir('.')
         if os.path.isfile(f) and f.endswith('.pdf')]
print(files)

for pdf in files:
    try:
        pages = PdfReader(pdf).pages
    except PdfParseError as err:
        # A ".pdf" extension does not guarantee PDF content (for example a
        # saved HTML page fails with "Invalid PDF header"). Skip that file
        # and keep processing the remaining ones instead of aborting.
        print(f'Skipping {pdf}: {err}')
        continue

    # splitext keeps the original casing and any dots inside the name;
    # str.title() would re-case it and split('.') would cut it short.
    title = os.path.splitext(pdf)[0]

    for part in parts:
        # Keep only the page numbers this document actually has, so a short
        # document cannot raise IndexError (or wrap around via index -1).
        wanted = [n for n in range(*part) if 1 <= n <= len(pages)]
        if not wanted:
            print(f'Skipping {pdf}: no pages in range {part}')
            continue

        outdata = PdfWriter(f'{title}_{part[0]}_.pdf')
        for pagenum in wanted:
            # Page numbers are 1-based, list indexes are 0-based.
            outdata.addpage(pages[pagenum - 1])
        outdata.write()
如果可能请帮忙
raise PdfParseError('Invalid PDF header: %s' %
pdfrw.errors.PdfParseError: Invalid PDF header: '<!doctype html>'
Manas,
实现这一需求的一种方法是调用 Web API。例如,下面的代码片段会先上传 PDF 文件,再按指定页码将其拆分。
import os
import requests # pip install requests
# The authentication key (API Key).
# Get your own by registering at https://app.pdf.co
API_KEY = "*********************************"
# Base URL for PDF.co Web API requests
BASE_URL = "https://api.pdf.co/v1"
# Source PDF file, located in the current directory. Built with os.path so
# the separator is right on every platform and no backslash escape
# (the former "\s") is needed inside the literal.
SourceFile = os.path.join(os.curdir, "sample.pdf")
# Comma-separated list of page numbers (or ranges) to process. Example: '1,3-5,7-'.
Pages = "1-2,3-"
def main(args=None):
    """Upload ``SourceFile`` and split the uploaded copy.

    ``args`` is accepted but not used. Nothing is split when the upload
    step yields ``None``.
    """
    uploadedFileUrl = uploadFile(SourceFile)
    # None is a singleton, so test it by identity rather than equality.
    if uploadedFileUrl is not None:
        splitPDF(uploadedFileUrl)
def splitPDF(uploadedFileUrl):
    """Split PDF using PDF.co Web API.

    Posts ``uploadedFileUrl`` together with the module-level ``Pages``
    selection to the ``pdf/split`` endpoint, then saves every URL listed
    under ``"urls"`` in the reply as ``Page1.pdf``, ``Page2.pdf``, ... in the
    current directory. Failures are printed, not raised. Returns ``None``.
    """
    # Prepare request parameters
    # See documentation: https://apidocs.pdf.co
    parameters = {"pages": Pages, "url": uploadedFileUrl}

    # Prepare URL for 'Split PDF' API request
    url = "{}/pdf/split".format(BASE_URL)

    # Execute request and get response as JSON
    response = requests.post(url, data=parameters, headers={"x-api-key": API_KEY})
    if response.status_code != 200:
        print(f"Request error: {response.status_code} {response.reason}")
        return

    # Named `result` rather than `json` so the stdlib module name is not shadowed.
    result = response.json()
    if result["error"]:
        # Show service reported error
        print(result["message"])
        return

    # Download the generated PDF parts, numbering the local files from 1.
    for part, resultFileUrl in enumerate(result["urls"], start=1):
        localFileUrl = f"Page{part}.pdf"
        # The context manager releases the streamed connection when done.
        with requests.get(resultFileUrl, stream=True) as r:
            if r.status_code != 200:
                # Report the failed download itself, not the earlier split call.
                print(f"Request error: {r.status_code} {r.reason}")
                continue
            with open(localFileUrl, 'wb') as file:
                for chunk in r:
                    file.write(chunk)
        print(f'Result file saved as "{localFileUrl}" file.')
def uploadFile(fileName):
    """Uploads file to the cloud.

    First asks the API for a presigned upload URL, then PUTs the content of
    ``fileName`` to it.

    Returns the ``"url"`` value from the API reply, or ``None`` when any step
    fails; the failure is printed.
    """
    # 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.
    # Prepare URL for 'Get Presigned URL' API request. The query values are
    # passed through `params` so a file name containing spaces, '&' or other
    # reserved characters is percent-encoded instead of breaking the query.
    url = "{}/file/upload/get-presigned-url".format(BASE_URL)
    query = {
        "contenttype": "application/octet-stream",
        "name": os.path.basename(fileName),
    }

    # Execute request and get response as JSON
    response = requests.get(url, params=query, headers={"x-api-key": API_KEY})
    if response.status_code != 200:
        print(f"Request error: {response.status_code} {response.reason}")
        return None

    # Named `reply` rather than `json` so the stdlib module name is not shadowed.
    reply = response.json()
    if reply["error"]:
        # Show service reported error
        print(reply["message"])
        return None

    # URL to use for file upload
    uploadUrl = reply["presignedUrl"]
    # URL for future reference
    uploadedFileUrl = reply["url"]

    # 2. UPLOAD FILE TO CLOUD.
    with open(fileName, 'rb') as file:
        upload = requests.put(
            uploadUrl,
            data=file,
            headers={"x-api-key": API_KEY, "content-type": "application/octet-stream"},
        )
    if not upload.ok:
        # Do not hand back a URL for content that never arrived.
        print(f"Request error: {upload.status_code} {upload.reason}")
        return None

    return uploadedFileUrl
# Run the split workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()
现在,要合并PDF文件,您可以使用类似于下面的代码片段:
import os
import requests # pip install requests
# The authentication key (API Key).
# Get your own by registering at https://app.pdf.co
API_KEY = "**********************************"
# Base URL for PDF.co Web API requests
BASE_URL = "https://api.pdf.co/v1"
# Source PDF files, located in the current directory. Built with os.path so
# the separator is right on every platform and no backslash escapes are
# needed inside the literals.
SourceFile_1 = os.path.join(os.curdir, "sample1.pdf")
SourceFile_2 = os.path.join(os.curdir, "sample2.pdf")
# Destination PDF file name. The former literal ".\result.pdf" contained
# "\r", which Python reads as a carriage return, corrupting the name.
DestinationFile = os.path.join(os.curdir, "result.pdf")
def main(args=None):
    """Upload both source PDFs and merge them into ``DestinationFile``.

    ``args`` is accepted but not used. The merge is attempted only when both
    uploads produced a URL.
    """
    UploadedFileUrl_1 = uploadFile(SourceFile_1)
    UploadedFileUrl_2 = uploadFile(SourceFile_2)
    # None is a singleton, so test it by identity rather than equality.
    if UploadedFileUrl_1 is not None and UploadedFileUrl_2 is not None:
        # The merge call takes the source URLs as one comma-separated string.
        uploadedFileUrls = "{},{}".format(UploadedFileUrl_1, UploadedFileUrl_2)
        mergeFiles(uploadedFileUrls, DestinationFile)
def mergeFiles(uploadedFileUrls, destinationFile):
    """Perform Merge using PDF.co Web API.

    Posts ``uploadedFileUrls`` (a comma-separated string of source URLs) and
    the base name of ``destinationFile`` to the ``pdf/merge`` endpoint, then
    downloads the resulting document to ``destinationFile``. Failures are
    printed, not raised. Returns ``None``.
    """
    # Prepare request parameters
    # See documentation: https://apidocs.pdf.co
    parameters = {
        "name": os.path.basename(destinationFile),
        "url": uploadedFileUrls,
    }

    # Prepare URL for 'Merge PDF' API request
    url = "{}/pdf/merge".format(BASE_URL)

    # Execute request and get response as JSON
    response = requests.post(url, data=parameters, headers={"x-api-key": API_KEY})
    if response.status_code != 200:
        print(f"Request error: {response.status_code} {response.reason}")
        return

    # Named `result` rather than `json` so the stdlib module name is not shadowed.
    result = response.json()
    if result["error"]:
        # Show service reported error
        print(result["message"])
        return

    # Get URL of result file
    resultFileUrl = result["url"]

    # Download result file; the context manager releases the streamed
    # connection when done.
    with requests.get(resultFileUrl, stream=True) as r:
        if r.status_code != 200:
            # Report the failed download itself, not the earlier merge call.
            print(f"Request error: {r.status_code} {r.reason}")
            return
        with open(destinationFile, 'wb') as file:
            for chunk in r:
                file.write(chunk)
    print(f'Result file saved as "{destinationFile}" file.')
def uploadFile(fileName):
    """Uploads file to the cloud.

    First asks the API for a presigned upload URL, then PUTs the content of
    ``fileName`` to it.

    Returns the ``"url"`` value from the API reply, or ``None`` when any step
    fails; the failure is printed.
    """
    # 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.
    # Prepare URL for 'Get Presigned URL' API request. The query values are
    # passed through `params` so a file name containing spaces, '&' or other
    # reserved characters is percent-encoded instead of breaking the query.
    url = "{}/file/upload/get-presigned-url".format(BASE_URL)
    query = {
        "contenttype": "application/octet-stream",
        "name": os.path.basename(fileName),
    }

    # Execute request and get response as JSON
    response = requests.get(url, params=query, headers={"x-api-key": API_KEY})
    if response.status_code != 200:
        print(f"Request error: {response.status_code} {response.reason}")
        return None

    # Named `reply` rather than `json` so the stdlib module name is not shadowed.
    reply = response.json()
    if reply["error"]:
        # Show service reported error
        print(reply["message"])
        return None

    # URL to use for file upload
    uploadUrl = reply["presignedUrl"]
    # URL for future reference
    uploadedFileUrl = reply["url"]

    # 2. UPLOAD FILE TO CLOUD.
    with open(fileName, 'rb') as file:
        upload = requests.put(
            uploadUrl,
            data=file,
            headers={"x-api-key": API_KEY, "content-type": "application/octet-stream"},
        )
    if not upload.ok:
        # Do not hand back a URL for content that never arrived.
        print(f"Request error: {upload.status_code} {upload.reason}")
        return None

    return uploadedFileUrl
# Run the merge workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()
在这个例子中,我使用的是 PDF.co 的 API。更多信息请参考以下链接:
https://apidocs.pdf.co/30-pdf-split https://apidocs.pdf.co/31-pdf-merge
谢谢!