I have set up a Python script to download PDFs from a website. However, the code runs successfully at one point in time, and then when I run the same code at another point in time, say a few hours later, it raises the following error:

ValueError: No JSON object could be decoded

I assume this is because the server I am sending the POST request to does not send the expected response, and when I try to convert the JSON response into a Python object the error is raised.
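One quick way to test that assumption (a minimal sketch; post_url, payload and headers stand in for the same values built in the class below) is to print the status code and the raw body before calling .json():

import requests

# Diagnostic sketch: inspect the raw response before decoding it.
r = requests.post(post_url, data=payload, headers=headers)
print(r.status_code)                    # a 403/429/5xx here points to the server, not the JSON
print(r.headers.get('Content-Type'))    # should be application/json
print(repr(r.text[:200]))               # an HTML error page here would explain the ValueError
output = r.json()                       # decode only once the body looks like JSON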
Only the relevant parts of the code are shown; "..." indicates omitted sections of the code:
class BupaScraper(object):
    def __init__(self):
        self.search_request = {
            "QuoteCriteria":{
                "FamilyType":"Single",
                "State":"NSW",
                "DateOfBirth":"1993-04-24T00:00:00",
                "PartnerDateOfBirth":None,
                "Frequency":"WEEKLY",
                "IncomeTier":"base tier",
                "IncomeTierCombined":None,
                "IsFromPrice":True,
                "HasEligibleChild":None,
                "IsCoupleOrFamily":False,
                "Age":25},
            "Excess":100,
            "PackageEntityName":"active-saver",
            "ProductType":1
        }
        self.all_covers = {
            "QuoteCriteria":{
                "FamilyType":"Single",
                "State":"NSW",
                "DateOfBirth":"1993-04-24T00:00:00",
                "PartnerDateOfBirth":None,
                "Frequency":"WEEKLY",
                "IncomeTier":"base tier",
                "IncomeTierCombined":None,
                "IsFromPrice":True,
                "HasEligibleChild":None,
                "IsCoupleOrFamily":False,
                "Age":25},
            "HealthNeedFilter":""
        }
        self.header = {
            'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:60.0) Gecko/20100101 Firefox/60.0',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://www.bupa.com.au/health-insurance/cover/active-saver',
            'Content-Type': 'application/json',
            'Content-Length': '330'
        }
    def get_product_names_singles(self):
        # scrape product names
        combined_packages = []
        hospital_packages = []
        extras_packages = []
        post_url = 'https://www.bupa.com.au/api/cover/search'
        # Singles
        self.all_covers['QuoteCriteria']['FamilyType'] = 'Single'
        self.header['Referer'] = 'https://www.bupa.com.au/health-insurance/cover/singles'
        payload = json.dumps(self.all_covers)
        r = requests.post(post_url, data=payload, headers=self.header)
        output = r.json()
        #output = self.post_request(post_url,payload)
        package_names = []
        for item in output['Results']:
            for entity in item:
                if entity == 'EntityName' and item['EntityName'] not in package_names:
                    package_names.append(item['EntityName'])
        for prod_type in package_names:
            if 'hospital' not in prod_type and 'extras' not in prod_type:
                combined_packages.append(prod_type)
            elif 'extras' not in prod_type:
                hospital_packages.append(prod_type)
            elif 'hospital' not in prod_type:
                extras_packages.append(prod_type)
        singles_products = combined_packages + hospital_packages + extras_packages
        print singles_products
        # return all packages
        return singles_products
...
    def post_request(self, url, data):
        self.data = data
        self.url = url
        req = urllib2.Request(self.url)
        req.add_header('Content-Type', 'application/json')
        res = urllib2.urlopen(req, self.data)
        out = json.load(res)
        return out
    def get_pdf(self):
        link = 'https://www.bupa.com.au/api/cover/datasheets/search'
        directory = '/Users/U1085012/OneDrive/PDS data project/Bupa/PDS Files/'
        excess = [None, 0, 50, 100, 500]
        # singles
        for product in self.get_product_names_singles():
            self.search_request['PackageEntityName'] = product
            print product
            if 'extras' in product:
                self.search_request['ProductType'] = 2
            else:
                self.search_request['ProductType'] = 1
            for i in range(len(excess)):
                try:
                    self.search_request['Excess'] = excess[i]
                    payload = json.dumps(self.search_request)
                    output = self.post_request(link, payload)
                except urllib2.HTTPError:
                    continue
                else:
                    break
            path = output['FilePath'].encode('ascii')
            file_name = output['FileName'].encode('ascii')
            # check to see if the file exists; if not, retrieve it
            if os.path.exists(directory + file_name):
                pass
            else:
                ul.urlretrieve(path, directory + file_name)
...
How can I make the code more robust so that it runs every time?

Try adding a time.sleep right after sending the request; sometimes the response payload of your request is simply incomplete, and when you then try to convert it into a JSON object it raises the error:
import time
...
r = requests.post(post_url, data=payload, headers=self.header)
time.sleep(2)
output = r.json()
#output = self.post_request(post_url,payload)
package_names = []
Another thing you may want to try is wrapping the JSON conversion line in a try/except statement, so the script does not stop when it throws:

import time
...
r = requests.post(post_url, data=payload, headers=self.header)
time.sleep(2)
try:
    output = r.json()
except ValueError:
    print('response error: ' + str(r.status_code))
    return False
package_names = []
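Putting the two suggestions together, a small retry helper (just a sketch; post_json is a hypothetical name, not part of the original code) waits between attempts and gives up after a few failed decodes instead of crashing:

import time
import requests

def post_json(url, payload, headers, retries=3, delay=2):
    # Hypothetical helper (sketch): retry the POST a few times when the body
    # cannot be decoded as JSON, sleeping between attempts.
    for attempt in range(retries):
        r = requests.post(url, data=payload, headers=headers)
        try:
            return r.json()
        except ValueError:
            print('attempt %d failed with status %s' % (attempt + 1, r.status_code))
            time.sleep(delay)
    return None  # caller can skip this item when None comes back

The calling code would then use output = post_json(post_url, payload, self.header) in place of the direct r.json() call and skip that iteration when None is returned.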
While I very much appreciate the help given, and I have incorporated these suggestions into my code, I believe the actual reason I could not scrape the site is that my IP was temporarily blocked for a period of time (it looks like around 12 hours). This was the result of my constantly hitting the site during development of the script (which is now finished) and of the site's anti-scraping policy. Obviously there are measures I could take to get around this, such as rotating IP addresses, rotating user-agent IDs, and so on, but I do not want to move into a grey area when I only need to scrape this site three or four times a year, whenever the information I need is updated.