使用python web抓取获取UnboundLocalError



我为亚马逊网络抓取写了一个代码,我的代码适用于所有亚马逊产品,但它确实适用于一些我不知道的特定产品,它为一些产品显示的错误是UnboundLocalError:在分配之前引用了本地变量'reviews',你能指导我如何解决这个问题吗?谢谢!这是代码:

import requests 
from bs4 import BeautifulSoup as bs 
import json 
import random 
import os.path
import time
import pandas as pd
def scrape_products(response):
dataframe = pd.DataFrame()
res = pd.DataFrame()
# Checking if response is okay or not
if response.ok:
response = response.text
content = bs(response,'lxml')
# Selecting products
items = content.find_all('li' , class_ = 'zg-item-immersion')
for item in items:            
# Selecting data about each product
count = item.find('span' , class_ = 'zg-badge-text').text.strip()
title = item.find('div' , class_ = 'p13n-sc-truncate').text.strip()
price = item.find('span' , class_ = 'p13n-sc-price')
try:
rating = item.find('span' , class_ = 'a-icon-alt').text.strip()
total = item.find('a' , class_ = 'a-size-small a-link-normal').text.strip()
reviews = item.find('div' , class_ = 'a-icon-row a-spacing-none').find('a',class_='a-link-normal').get('href')
except:
pass
image_url = item.find('div' , class_ = 'a-section a-spacing-small').find('img').get('src')
go_to = item.find('span', class_ = 'a-list-item').find('a' , class_ = 'a-link-normal').get('href')
product_url = 'https://www.amazon.com' + go_to
#        product_url = product_url.replace('?','/ref=zg_bs_2399939011_1?')

reviews_url = 'https://www.amazon.com' + reviews
#         desc = requests.get(product_url)
#         print(desc.status_code)
#         if desc.ok:
#             desc = desc.text
#             data = bs(desc,'lxml')
#             old_price = data.find('span',class_='priceBlockStrikePriceString a-text-strike').text
#             if(old_price):
#                 print(old_price)
#             else:
#                 pass
print('************************************************ ' + count +' **********************************************')
print('Title: {}'.format(title + 'n'))
if(price):
price = price.text.strip()
price = price[1:]
print('Price: {}'.format(price))
else:
pass
print('Rating: {} ({})'.format(rating , total))
print('Reviews Url: {}'.format(reviews_url))
print('Image Url: {}'.format(image_url))
print('Product Url: {}'.format(product_url))

print()
print()
data = {'Title':[title], 'Price':[price], 'Rating':[str(rating) +'('+ str(total)+')'],
'Reviews Url':[reviews_url], 'Image Url': [image_url], 'Product Url':[product_url]}
df = pd.DataFrame(data)
dataframe = dataframe.append(df)
return dataframe

def main():
page_1 = 'https://www.amazon.com/Best-Sellers-Amazon-Device-Smart-Locks/zgbs/amazon-devices/17295887011/ref=zg_bs_nav_3_5499877011'
#page_2 = 'https://www.amazon.com/Best-Sellers-Fire-Tablets-Bundles/zgbs/amazon-devices/17142718011/ref=zg_bs_pg_1?_encoding=UTF8&pg=2'
response_1 = requests.get(page_1)
#response_2 = requests.get(page_2)

df1 = scrape_products(response_1)
#df2 = scrape_products(response_2)
#df = df1.append(df2)
df1.Price = df1.Price.astype(float)
df1 = df1.sort_values(by=['Price'])
df1.to_csv('AMAZONAmazon Devices & AccessoriesAmazon Device AccessoriesBest Sellers in Amazon Device Smart Locks.csv',index=False)

if __name__== "__main__":
main()

您在try/except块中设置reviews,该块只会忽略错误。。。。并且不设置CCD_ 2。在这种情况下,reviews_url = 'https://www.amazon.com' + reviews在分配之前引用reviews。更糟糕的是,ratingtotalreviews可能是来自上一个循环的陈旧数据,从而无声地输出坏数据。您需要一个处理错误的策略,例如跳过该项目

try:
rating = item.find('span' , class_ = 'a-icon-alt').text.strip()
total = item.find('a' , class_ = 'a-size-small a-link-normal').text.strip()
reviews = item.find('div' , class_ = 'a-icon-row a-spacing-none').find('a',class_='a-link-normal').get('href')
except:
print("ERROR scanning {}, ignored".format(item))
import traceback
traceback.print_exc()
continue

抓住赤裸裸的例外是个坏主意。它屏蔽了错误以及预期的错误。当你打印出错误时,你会感觉到哪些事情可以忽略,并将异常处理程序更新为

except (IndexError, ValueError) as e:
....

最新更新