Downloading PDFs from multiple JSON URLs with Python



My task is to create a method that downloads multiple PDFs from URLs contained in JSON files. Each JSON file may have one URL, and each batch contains roughly 500k JSON files.

Here is an example of one of the JSON files:

{
  "from": null,
  "id": "sfm_c4kjatol7u8psvqfati0",
  "imb_code": "897714123456789",
  "mail_date": null,
  "mail_type": "usps_first_class",
  "object": "self_mailer",
  "press_proof": "https://lob-assets.com/sid-self_mailers/sfm_c4kjatol7u8psvqfati0.pdf?version=v1&expires=1635274615&signature=AZlb0MSzZPuCjtKFkXRr_OoHzDzEy23UqzmKFWs5bycKCEcIyfe2od58zHzfP1a-iW5d9azFYUT1PnosqKcvBg",
  "size": "11x9_bifold",
  "target_delivery_date": null,
  "to": {
    "address_city": "SAN FRANCISCO",
    "address_country": "UNITED STATES",
    "address_line1": "185 BERRY ST STE 6100",
    "address_line2": null,
    "address_state": "CA",
    "address_zip": "94107-1741",
    "company": "Name.COM",
    "name": "EMILE ILES"
  }
}

I want to convert the JSON files to a CSV file and download the URLs.

This is what I have been trying to use, but it is not working. What am I missing?

Import urllib.request, json, requests, os, csvkit

from itertools import islice
from pathlib import Path
path = Path("/Users/MyComputer/Desktop/self_mailers")
paths = [i.path for i in islice(os.scandir(path), 100)]
in2csv data.json > data.csv
with open('*.json', 'r') as f:
    urls_dict = json.load(f)
    urls_dict = urls_dict[0]
    itr = iter(urls_dict)
    len(list(itr))
    f.write(r.pdf)

Why do you want to convert the JSON to CSV? By the way, if you are not sure where the URL is located in the JSON, this is what I would do:

import os
import json
from rethreader import Rethreader
from urllib.parse import urlparse
from urllib.request import urlretrieve

def download_pdf(url):
    # use urlparse to find the pdf name
    filename = urlparse(url).path.rsplit('/')[-1]
    urlretrieve(url, filename)

# use multi-threading for faster downloads
downloader = Rethreader(download_pdf).start()

def verify_url(value):
    if not isinstance(value, str):
        # if the value is not a string, it cannot be a URL
        return False
    try:
        parsed_url = urlparse(value)
    except AttributeError:
        # the value cannot be parsed as a URL
        return False
    if not (parsed_url.scheme and parsed_url.netloc and parsed_url.path):
        # the value is not a URL because it lacks a scheme, host or path
        return False
    return True

def parse_data(data):
    for value in data.values():
        if verify_url(value):
            downloader.add(value)

for file in os.listdir():
    with open(file) as fp:
        try:
            json_data = json.load(fp)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # this file is not a JSON file; skip to the next one
            continue
        parse_data(json_data)

# quit the downloader after downloading the files
downloader.quit()
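rethreader is a third-party package; if you would rather stay on the standard library, the same multi-threaded fan-out can be done with concurrent.futures. Below is a minimal sketch, reusing the verify_url helper defined above; the pool size of 8 is just an assumption, tune it to your connection.

import os
import json
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
from urllib.request import urlretrieve

def download_pdf(url):
    # name the local file after the last segment of the URL path
    filename = urlparse(url).path.rsplit('/')[-1]
    urlretrieve(url, filename)

with ThreadPoolExecutor(max_workers=8) as pool:
    for file in os.listdir():
        with open(file) as fp:
            try:
                json_data = json.load(fp)
            except (json.JSONDecodeError, UnicodeDecodeError):
                continue
            for value in json_data.values():
                if verify_url(value):
                    # schedule each download on the thread pool
                    pool.submit(download_pdf, value)
# leaving the "with" block waits for every queued download to finish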

If you know which keys can be URLs, this is what I would do:

# The other parts same as before
def parse_data(data):
    for key in ['possible_key', 'another_possible_key']:
        if key in data and verify_url(data[key]):
            downloader.add(data[key])
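
In the sample JSON from the question the PDF link sits under the press_proof key, so the keyed variant could be as simple as this (add further keys if your files contain other URLs):

# The other parts same as before
def parse_data(data):
    # "press_proof" holds the PDF link in the sample record
    for key in ['press_proof']:
        if key in data and verify_url(data[key]):
            downloader.add(data[key])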

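If you still need the CSV file the question asks for, the standard library's csv module can write one row per record while you scan the files. A minimal sketch follows; the urls.csv name and the choice of columns (id, press_proof, plus two fields from the "to" block) are just assumptions, pick whatever columns you actually need.

import csv
import json
import os

with open('urls.csv', 'w', newline='') as out:
    writer = csv.DictWriter(out, fieldnames=['id', 'press_proof', 'name', 'address_city'])
    writer.writeheader()
    for file in os.listdir():
        if not file.endswith('.json'):
            continue
        with open(file) as fp:
            try:
                record = json.load(fp)
            except (json.JSONDecodeError, UnicodeDecodeError):
                continue
        to = record.get('to') or {}
        writer.writerow({
            'id': record.get('id'),
            'press_proof': record.get('press_proof'),
            'name': to.get('name'),
            'address_city': to.get('address_city'),
        })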