AttributeError："NoneType" 对象没有属性 "text"（网页抓取）

这是我关于web抓取的案例研究。我在最后一段代码中遇到了一个问题，"NoneType"对象没有属性"text"，所以我试图用"getattr"函数修复它，但没有成功。

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Listing page whose product links are collected below.
url = 'https://www.birdsnest.com.au/womens/dresses'
source = requests.get(url)
soup = BeautifulSoup(source.content, 'lxml')

# Every <div id="items"> container found on the listing page.
productlist = soup.find_all('div', id='items')

productlinks = []
for item in productlist:
    for link in item.find_all('a', href=True):
        # Resolve the href against the listing URL. Plain string
        # concatenation (url + href) glues the whole listing path in front
        # of the href and yields a wrong address for both relative and
        # absolute hrefs.
        productlinks.append(urljoin(url, link['href']))
print(len(productlinks))

from urllib.parse import urljoin

# Walk the paginated listing and collect the product links of every page.
productlinks = []
# range() excludes its stop value, so this visits pages 1 through 27.
for x in range(1, 28):
    source = requests.get(f'https://www.birdsnest.com.au/womens/dresses?_lh=1&page={x}')
    soup = BeautifulSoup(source.content, 'lxml')
    # Query the page that was just parsed. Iterating a `productlist` built
    # before the loop would re-append the first page's links on every pass
    # and ignore the freshly downloaded page entirely.
    productlist = soup.find_all('div', id='items')
    for item in productlist:
        for link in item.find_all('a', href=True):
            # urljoin resolves relative and absolute hrefs correctly, which
            # plain `url + href` concatenation does not.
            productlinks.append(urljoin(url, link['href']))
print(productlinks)

# Download each product page and print its name, price and feature text.
for link in productlinks:
    source = requests.get(link)
    soup = BeautifulSoup(source.content, 'lxml')

    name_tag = soup.find('h1', class_='item-heading__name')
    price_tag = soup.find('p', class_='item-heading__price')
    feature_tag = soup.find('div', class_='tab-accordion__content active')

    # find() returns None when nothing matches, so accessing `.text` below
    # raises AttributeError for a page that lacks one of these elements.
    product = {
        'name': name_tag.text.strip(),
        'price': price_tag.text.strip(),
        'feature': feature_tag.text.strip(),
    }
    print(product)

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-7-d4d46558690d> in <module>()
3     soup = BeautifulSoup(source.content, 'lxml')
4 
----> 5     name = soup.find('h1',class_='item-heading__name').text.strip()
6     price = soup.find('p',class_='item-heading__price').text.strip()
7     feature = soup.find('div',class_='tab-accordion__content active').text.strip()
AttributeError: 'NoneType' object has no attribute 'text'
---------------------------------------------------------------------------

所以我试着用这个方法修复，但没有成功。

# Attempted workaround: getattr() with a default yields None instead of
# raising when find() did not match. It hides the AttributeError but does not
# make the lookups succeed.
for link in productlinks:
    response = requests.get(link)
    soup = BeautifulSoup(response.content, 'lxml')

    heading = soup.find('h1', class_='item-heading__name')
    cost = soup.find('p', class_='item-heading__price')
    details = soup.find('div', class_='tab-accordion__content active')

    record = {
        'name': getattr(heading, 'text', None),
        'price': getattr(cost, 'text', None),
        'feature': getattr(details, 'text', None),
    }
    print(record)

这就是输出。所有字段都只显示 None：

{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}

首先，抓取页面之前，请务必先在浏览器中禁用该页面的 JavaScript 再查看源码。这样你会发现标签的 class 与启用 JS 时并不相同，而禁用 JS 后看到的这些 class 才是你应该定位的目标（requests 拿到的正是未经 JS 渲染的 HTML）。

此外，当在页面中循环时，不要忘记Python的range()停止值是不包含的。也就是说，此range(1, 28)将在第27页停止。

以下是我的做法：

import json
import requests
from bs4 import BeautifulSoup

# Cookies passed along with each listing request.
# NOTE(review): these look like values captured from one browser session;
# confirm they are still valid or needed before reusing them.
cookies = {
"ServerID": "1033",
"__zlcmid": "10tjXhWpDJVkUQL",
}
# Request headers: the user-agent is a desktop Chrome 86 on Linux string
# (the two adjacent literals are joined into a single value).
headers = {
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
}

def extract_info(bs: BeautifulSoup, tag: str, attr_value: str) -> list:
    """Collect the stripped text of matching elements.

    Args:
        bs: Parsed document to search.
        tag: Tag name to look for (e.g. ``"h2"``).
        attr_value: Required value of the element's ``itemprop`` attribute.

    Returns:
        One whitespace-stripped string per matching element, in the order
        ``find_all`` yields them.
    """
    texts = []
    for element in bs.find_all(tag, {"itemprop": attr_value}):
        texts.append(element.text.strip())
    return texts

# Scrape every listing page and build one record per dress.
all_pages = []
# range() excludes its stop value, so this covers pages 1 through 28.
for page in range(1, 29):
    print(f"Scraping data from page {page}...")
    current_page = f"https://www.birdsnest.com.au/womens/dresses?page={page}"
    source = requests.get(current_page, headers=headers, cookies=cookies)
    soup = BeautifulSoup(source.content, 'html.parser')

    brand = extract_info(soup, tag="strong", attr_value="brand")
    name = extract_info(soup, tag="h2", attr_value="name")
    price = extract_info(soup, tag="span", attr_value="price")

    # The three lists are paired by position; zip() stops at the shortest
    # one, so an item missing a field drops trailing entries on that page.
    all_pages.extend(
        {
            "brand": b,
            "name": n,
            "price": p,
        }
        for b, n, p in zip(brand, name, price)
    )

# "\n" (not a literal "n") separates the record dump from the summary line.
print(f"{all_pages}\nFound: {len(all_pages)} dresses.")
with open("all_the_dresses2.json", "w", encoding="utf-8") as jf:
    json.dump(all_pages, jf, indent=4)

这样你就会得到一个包含所有裙子的 JSON 文件。

{
"brand": "boho bird",
"name": "Prissy Dress",
"price": "$189.95"
},
{
"brand": "boho bird",
"name": "Dandelion Dress",
"price": "$139.95"
},
{
"brand": "Lula Soul",
"name": "Dandelion Dress",
"price": "$179.95"
},
{
"brand": "Honeysuckle Beach",
"name": "Cotton V-Neck A-Line Splice Dress",
"price": "$149.95"
},
{
"brand": "Honeysuckle Beach",
"name": "Lenny Pinafore",
"price": "$139.95"
},
and so on for the next 28 pages ...

相关内容

最新更新

热门标签：