# Fetch the Newegg desktop graphics-card listing and print the brand, product
# name and shipping text of every product tile.
# NOTE(review): `ureq` and `soup` are not defined in this snippet; presumably
# they are urllib.request.urlopen and bs4.BeautifulSoup -- confirm.
url2 = 'https://www.newegg.ca/Desktop-Graphics-Cards/SubCategory/ID-48?Tid=7708'
# opening up connection, grabbing page
uclient = ureq(url2)
html = uclient.read()
uclient.close()
# html parsing
page_soup = soup(html, "html.parser")
# grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})
print(containers[0].div.div.a.img["title"])
for container in containers:
    # This is the line that fails: when a tile has no <img> at this exact
    # path, `.img` is None and subscripting it raises
    # "TypeError: 'NoneType' object is not subscriptable".
    brand = container.div.div.a.img["title"]
    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()
    print(brand)
    print(product_name)
    print(shipping)
问题出现在 for 循环中的 brand = container.div.div.a.img["title"] 这一行:它报错 "'NoneType' object is not subscriptable"('NoneType' 对象不可下标)。
奇怪的是,我可以访问这个标题,甚至可以在循环之外用 print(containers[0].div.div.a.img["title"]) 把它打印出来。
请帮我弄清楚这里发生了什么。谢谢,祝你一切顺利!
使用下面的代码给每个容器编号并输出:
# Print each container's index before processing it, so the output shows
# which container triggers the error.
for number, container in enumerate(containers):
    print("---", number, "---")
    # ... code ...
我发现只有 containers[15] 会出现这个问题。
您应该使用 if/else 来检查 container.div.div.a.img 是否为 None,然后跳过该元素,或者设置一个默认文本。
# Only subscript the <img> tag when it exists at this path; otherwise use a
# placeholder brand instead of raising TypeError on None.
if container.div.div.a.img:
    brand = container.div.div.a.img["title"]
else:
    brand = "???"
完整的可运行代码:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq
# Fetch the Newegg desktop graphics-card listing and print, for every product
# tile, its index, brand, product name and shipping text.
url2 = 'https://www.newegg.ca/Desktop-Graphics-Cards/SubCategory/ID-48?Tid=7708'
# opening up connection, grabbing page; the with-block closes the connection
# even if read() raises
with ureq(url2) as uclient:
    html = uclient.read()
# html parsing
page_soup = soup(html, "html.parser")
# grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})
#print(containers[15].div.div.a.img["title"])
for number, container in enumerate(containers):
    print("---", number, "---")
    # Some tiles have no <img> at this exact path; use a placeholder brand
    # for them instead of subscripting None.
    if container.div.div.a.img:
        brand = container.div.div.a.img["title"]
    else:
        brand = "???"
    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text
    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()
    print(brand)
    print(product_name)
    print(shipping)
编辑:
我查看了网页,发现 containers[15] 多了一个额外的 div,里面有文本 "#1 BEST SELLER",正是它造成了问题。这种情况需要用另一种方式来获取品牌,即:
# Search the descendants of the container's first div for an <img> that has a
# title attribute, instead of relying on the fixed div.div.a.img path.
brand = container.div.find("img", {"title": True})["title"]
这种写法甚至可以用于所有容器:
# Print index, brand, product name and shipping text for every product tile,
# locating each piece with find() rather than a fixed tag path.
for number, container in enumerate(containers):
    print("---", number, "---")
    #if container.div.div.a.img:
    #    brand = container.div.div.a.img["title"]
    #else:
    #    brand = "???"
    # Look for any <img> with a title attribute under the first div, so an
    # extra wrapper div does not break the lookup.
    # NOTE: find() returns None when nothing matches, so a tile without such
    # an <img> would still raise TypeError here.
    brand = container.div.find('img', {"title": True})["title"]
    product_name = container.find("a", {"class": "item-title"}).text
    shipping = container.find("li", {"class": "price-ship"}).text.strip()
    print(brand)
    print(product_name)
    print(shipping)