I randomly get no output for the <h1> and <li> when scraping with BeautifulSoup


from bs4 import BeautifulSoup
import requests
import csv

# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
# }

csv_file = open("scifi_audible.csv", "w")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["title ", "link ", "rating ", "reviews "])

url = "https://www.audible.de/"
audiobooklinks = []

# collect the audiobook links from the search result pages
for x in range(1, 2):
    source = requests.get(f"https://www.audible.de/search?node=16245852031&page={x}")
    soup = BeautifulSoup(source.content, "lxml")
    audiobooks = soup.find_all("h3", class_="bc-heading")
    for item in audiobooks:
        for link in item.find_all("a", href=True):
            audiobooklinks.append(url + link["href"])

# testlink = 'https://www.audible.de/pd/Mortarion-The-Pale-King-Hoerbuch/B0BCQXVJML'

# scrape title, rating and review count from each detail page
for link in audiobooklinks:
    r = requests.get(link)  # headers=headers)
    soup = BeautifulSoup(r.content, "lxml")

    try:
        title = soup.find("h1", class_="bc-heading").text.strip()
    except:
        title = "no output possible"
    try:
        rating = soup.find("span", attrs={"aria-hidden": "true", "class": "bc-text"}).text.strip()
    except:
        rating = "no rating"
    try:
        raw_reviews = soup.find("li", class_="bc-list-item ratingsLabel").text.strip()
    except:
        raw_reviews = "no raw_reviews"
    try:
        reviews = raw_reviews.split("(")[-1].split()[0].replace(".", "")
    except:
        reviews = "no reviews"

    print(title, link, rating, reviews)
    csv_writer.writerow([title, link, rating, reviews])

csv_file.close()

Most of the time it works. Randomly it prints, for example:

"无法输出https://www.audible.de//pd/Mortarion-The-Pale-King-Hoerbuch/B0BCQXVJML无评级无;

What do I have to change so that I always get the h1 and the li?

When I tried to reproduce the "no output possible" scenario, I got it roughly 2% of the time; it was almost always caused by a 503 Service Unavailable error. Occasionally the status was 200 [OK] but the content was empty, and I don't really know what causes that. (As an aside, when working with requests you should generally check that status_code == 200 before going any further.)

One way to handle it is to re-append the failed links and retry them, something like

repeats = 0                    # count bad responses
maxRepeats = 10                # limit allowed errors
abl_copy = audiobooklinks[:]   # preserve original list

for link in abl_copy:
    r = requests.get(link)  # headers=headers)

    if r.status_code != 200 or not r.content:
        print(f'! {r.status_code} {r.reason} - for {link} !')
        repeats += 1
        if maxRepeats < repeats:
            print('! Stopping because of too many bad responses !')
            break
        abl_copy.append(link)  # re-queue the link so the loop retries it later
        continue

    soup = BeautifulSoup(r.content, "lxml")
    # rest of your for loop

There are plenty of other ways to handle it as well: you could add a wait every time you get a 503, you could collect all the bad responses [r] in a list and explore/handle them after the loop, and so on.
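For example, here is a minimal sketch of that wait-and-collect variant; the 5-second delay, the single retry and the bad_responses list are just illustrative choices, not part of the code above:

import time

bad_responses = []  # keep failed responses for inspection after the loop

for link in audiobooklinks:
    r = requests.get(link)

    if r.status_code == 503:
        time.sleep(5)            # back off briefly, then retry once
        r = requests.get(link)

    if r.status_code != 200 or not r.content:
        bad_responses.append(r)  # look at r.status_code / r.url afterwards
        continue

    soup = BeautifulSoup(r.content, "lxml")
    # rest of your for loop

requests can also retry 503s for you automatically if you mount an HTTPAdapter with a urllib3 Retry object on a Session, which saves you from writing the retry logic by hand.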
