美丽汤网页抓取:未绑定本地错误:赋值前引用的局部变量"soup"



我尝试了用漂亮的汤和请求进行网络抓取,通过关注Youtube视频,一切都很顺利,直到我出现这个错误,但导师还是成功了。

import requests
from bs4 import BeautifulSoup
def get_data(url):
if not response.ok:
print('Server Responded: {}'.format(response.status_code))
else:
soup = BeautifulSoup(response.text, 'lxml')
return(soup)
def get_detail_data(soup):
try:
title = soup.find('h1', id='itemTitle').text.strip()
except:
title = ''

try:
p = soup.find('span', id='prcIsum').text.strip()
currency, price = p.split(' ')
except:
currency = ''
price = ''

try:
sold = soup.find('span', class_='vi-qtyS-hot-red').a.text.strip().split(' ')[0]
except:
sold = ''

data = {
'title' : title,
'currency' : currency,
'price' : price,
'total units sold' : sold
}
return data
def get_index_data(soup):
try:
links = soup.find_all('a', class_='s-item__link')
except:
links = []

urls = [item.get('href') for item in links]
return urls
def main():
url = 'https://www.ebay.com/sch/i.html?_nkw=mens+shoes&_sacat=0'

products = get_index_data(get_data(url))
for link in products:
data =  get_detail_data(get_data(link))

if __name__ == '__main__':
main()

在get_data中,您缺少实际请求和响应存储。如果response.ok不为True,则需要指定汤=无。最后,在其他地方,在尝试调用方法之前,您需要测试汤是否为None

import requests
from bs4 import BeautifulSoup
def get_data(url):

response = requests.get(url, headers = {'User-Agent':'Mozilla/5.0'}) #this was missing

if not response.ok:
print('Server Responded: {}'.format(response.status_code))
soup = None
else:
soup = BeautifulSoup(response.text, 'lxml')
return soup
def get_detail_data(soup):

try:
title = soup.find('h1', id='itemTitle').text.strip()
except:
title = ''

try:
p = soup.find('span', id='prcIsum').text.strip()
currency, price = p.split(' ')
except:
currency = ''
price = ''

try:
sold = soup.find('span', class_='vi-qtyS-hot-red').a.text.strip().split(' ')[0]
except:
sold = ''

data = {
'title' : title,
'currency' : currency,
'price' : price,
'total units sold' : sold
}
return data
def get_index_data(soup):
try:
links = soup.find_all('a', class_='s-item__link')
except:
links = []

urls = [item.get('href') for item in links]
return urls
def main():

url = 'https://www.ebay.com/sch/i.html?_nkw=mens+shoes&_sacat=0'
soup = get_data(url)

if not soup is None:

products = get_index_data(soup)
#print(products)
for link in products:

soup = get_data(link)

if not soup is None:

data =  get_detail_data(soup)
print(data)
if __name__ == '__main__':
main()

最新更新