这是我关于 Web 抓取的案例研究。我在最后一段代码中遇到了一个错误:'NoneType' object has no attribute 'text'。我尝试用 getattr 函数修复它,但没有成功。
''
import requests
from bs4 import BeautifulSoup
# NOTE(review): indentation was lost when this snippet was pasted; presumably
# each line that follows a `for` header belongs to that loop's body.
# Download the first listing page and parse it with the lxml parser.
url = 'https://www.birdsnest.com.au/womens/dresses'
source = requests.get(url)
soup = BeautifulSoup(source.content, 'lxml')
''
# Every <div id="items"> element found in the parsed page.
productlist= soup.find_all('div', id='items')
''
# Collect the href of every anchor inside those containers, prefixed with `url`.
productlinks = []
for item in productlist:
for link in item.find_all('a',href=True):
# NOTE(review): the href is appended to the full listing URL as plain text.
# If the site emits absolute or root-relative hrefs, the result is not a
# valid product address -- verify, and consider urllib.parse.urljoin.
productlinks.append(url + link['href'])
# Report how many links were collected from this one page.
print(len(productlinks))
''
# Start over with an empty list and walk the paginated listing.
productlinks = []
# range(1, 28) yields 1..27: the stop value is exclusive.
for x in range(1,28):
source = requests.get(f'https://www.birdsnest.com.au/womens/dresses?_lh=1&page={x}')
soup = BeautifulSoup(source.content, 'lxml')
# NOTE(review): `productlist` is not recomputed from the freshly parsed `soup`
# in this loop, so each iteration appears to walk the containers found on the
# first page again instead of the page that was just downloaded.
for item in productlist:
for link in item.find_all('a',href=True):
# Same string concatenation as for the first page; see whether the resulting
# URLs are actually reachable before relying on them.
productlinks.append(url + link['href'])
print(productlinks)
''
# Visit every collected link and pull name, price and feature text from it.
for link in productlinks:
source = requests.get(link)
soup = BeautifulSoup(source.content, 'lxml')
# soup.find() returns None when no element matches, so the chained `.text`
# raises AttributeError on any page that lacks the expected element.
name = soup.find('h1',class_='item-heading__name').text.strip()
price = soup.find('p',class_='item-heading__price').text.strip()
feature = soup.find('div',class_='tab-accordion__content active').text.strip()
# NOTE(review): the name `sum` shadows the built-in sum() for the rest of the
# script.
sum = {
'name':name,
'price':price,
'feature':feature
}
print(sum)
''
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-7-d4d46558690d> in <module>()
3 soup = BeautifulSoup(source.content, 'lxml')
4
----> 5 name = soup.find('h1',class_='item-heading__name').text.strip()
6 price = soup.find('p',class_='item-heading__price').text.strip()
7 feature = soup.find('div',class_='tab-accordion__content active').text.strip()
AttributeError: 'NoneType' object has no attribute 'text'
---------------------------------------------------------------------------
所以我试着用这个方法修复,但没有成功。
# Second attempt: same traversal, but tolerate elements that were not found.
for link in productlinks:
source = requests.get(link)
soup = BeautifulSoup(source.content, 'lxml')
# getattr(obj, 'text', None) falls back to None when soup.find() returned
# None. That suppresses the AttributeError but cannot supply data for an
# element that is not in the downloaded page.
# NOTE(review): unlike the first attempt, `.strip()` is no longer applied, so
# any text that is found keeps its surrounding whitespace.
name = getattr(soup.find('h1',class_='item-heading__name'),'text',None)
price = getattr(soup.find('p',class_='item-heading__price'),'text',None)
feature = getattr(soup.find('div',class_='tab-accordion__content active'),'text',None)
# NOTE(review): the name `sum` shadows the built-in sum().
sum = {
'name':name,
'price':price,
'feature':feature
}
print(sum)
这就是输出。所有字段都只显示 None:
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
首先,请始终在浏览器中禁用 JavaScript 之后再查看要抓取的页面。这样您就会发现标签的 class 与启用 JS 时不同,而禁用 JS 后看到的这些 class 才是您应该定位的目标。
此外,在遍历分页时,不要忘记 Python 的 range() 的停止值是不包含在内的。也就是说,range(1, 28) 会在第 27 页停止。
以下是我的做法:
import json
import requests
from bs4 import BeautifulSoup
# Cookies passed to requests.get() for each listing page.
# NOTE(review): these look like values captured from one browser session --
# confirm they are still valid and actually required by the site.
cookies = {
"ServerID": "1033",
"__zlcmid": "10tjXhWpDJVkUQL",
}
# Browser-like User-Agent passed to requests.get() for each listing page.
# The two adjacent string literals are concatenated into a single header value.
headers = {
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
}
def extract_info(bs: BeautifulSoup, tag: str, attr_value: str) -> list:
    """Collect the stripped text of every ``tag`` element with a given ``itemprop``.

    Args:
        bs: Parsed document to search; only its ``find_all`` method is used.
        tag: Name of the HTML tag to match, e.g. ``"h2"``.
        attr_value: Required value of the element's ``itemprop`` attribute.

    Returns:
        The text of each matching element with surrounding whitespace
        removed, in the order ``find_all`` yields them. Empty when nothing
        matches.
    """
    matches = bs.find_all(tag, {"itemprop": attr_value})
    return [element.text.strip() for element in matches]
# Accumulates one {"brand", "name", "price"} record per dress across all pages.
all_pages = []
for page in range(1, 29):  # stop value is exclusive: requests pages 1 through 28
    print(f"Scraping data from page {page}...")
    current_page = f"https://www.birdsnest.com.au/womens/dresses?page={page}"
    # A timeout keeps a single stalled connection from hanging the whole crawl.
    source = requests.get(
        current_page, headers=headers, cookies=cookies, timeout=30
    )
    soup = BeautifulSoup(source.content, 'html.parser')

    # Three parallel lists, one entry per product found on this page.
    brand = extract_info(soup, tag="strong", attr_value="brand")
    name = extract_info(soup, tag="h2", attr_value="name")
    price = extract_info(soup, tag="span", attr_value="price")

    # zip() stops at the shortest list, so a product missing one of the three
    # fields shortens (and may misalign) the records for this page.
    all_pages.extend(
        {"brand": b, "name": n, "price": p}
        for b, n, p in zip(brand, name, price)
    )

# Summary goes on its own line after the full dump of the collected records.
print(f"{all_pages}\nFound: {len(all_pages)} dresses.")

# Explicit encoding so the output does not depend on the platform default.
with open("all_the_dresses2.json", "w", encoding="utf-8") as jf:
    json.dump(all_pages, jf, indent=4)
这会让你得到一个包含所有裙子的 JSON 文件。
{
"brand": "boho bird",
"name": "Prissy Dress",
"price": "$189.95"
},
{
"brand": "boho bird",
"name": "Dandelion Dress",
"price": "$139.95"
},
{
"brand": "Lula Soul",
"name": "Dandelion Dress",
"price": "$179.95"
},
{
"brand": "Honeysuckle Beach",
"name": "Cotton V-Neck A-Line Splice Dress",
"price": "$149.95"
},
{
"brand": "Honeysuckle Beach",
"name": "Lenny Pinafore",
"price": "$139.95"
},
and so on for the next 28 pages ...