Python3,抓取返回[]



我有一个抓取网站的 Python 脚本,需要抓取一些信息(名称、价格、链接、ID),并把这些信息存入 MongoDB。但是我的抓取函数有个问题:它返回的是一个空列表。

你能帮我一下吗?对不起,我的英语不好,提前谢谢你
from bs4 import BeautifulSoup
import requests
import time
import pymongo
import difflib
import functools
# Listing page requested by the scraper below.
URL = 'https://www.nickollsandperks.co.uk/New-and-Special-Offers/New-Whisky?order=relevance:asc'
# Headers sent with the request; the two adjacent string literals are
# concatenated into a single User-Agent value.
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/39.0.2171.95 Safari/537.36'}

def newScrapItems():
    """Fetch the page at URL and collect product records from its markup.

    Returns:
        A list of dicts with the keys 'price', 'name', 'link' and 'ID'.
        One dict is appended for every combination of elements matched by
        the nested searches below, so the list is empty as soon as any
        level of the search matches nothing.
    """
    content3 = requests.get(URL, headers=headers)
    soup3 = BeautifulSoup(content3.text, 'html.parser')
    itemList = []

    for products in soup3.find_all("div", {"class": 'facets-facet-browse-items'}):
        for products_info in products.find_all("div", {"class": "facets-items-collection-view-row"}):
            for products_info_final in products_info.find_all("div", {"class": "facets-item-cell-list"}):
                for generalInfo in products_info_final.find_all("div", {"class": "facets-item-cell-list-right"}):
                    for links in generalInfo.find_all("meta"):
                        # Titles are searched from the row, not from the cell.
                        for itemNamesNext in products_info.find_all("div", {"class": "item-title-description"}):
                            for prePrice in generalInfo.find_all("div", {"class": "item-button"}):
                                for Names in itemNamesNext.find_all("span"):
                                    # NOTE(review): 'ProductViewsPrice.Price' is matched as a CSS
                                    # class here; if no div carries that class these inner loops
                                    # never run and the function returns [] - verify against the
                                    # page markup.
                                    for priceInfo in prePrice.find_all("div", {"class": "ProductViewsPrice.Price"}):
                                        for Price in priceInfo.find_all("span", {"class": "product-views-price-lead"}):
                                            productNames = {}
                                            productNames['price'] = Price.get_text()
                                            productNames['name'] = Names.get_text()
                                            productNames['link'] = links['content']
                                            productNames['ID'] = products_info_final['data-item-id']
                                            itemList.append(productNames)

    return itemList


newItems = newScrapItems()
# Print the list fetched above instead of calling the scraper a second time,
# which would issue another HTTP request for the same page.
print(newItems)

返回:

[]
Process finished with exit code 0

我试着搜索过这个问题,但没有找到任何结果。我真的希望有人能帮我解决这个问题,因为我已经为此挣扎了好几天。

我在另一个站点上用同样的方式写过类似的代码,只是嵌套层数更少:

content3 = requests.get(URL, headers=headers)
soup3 = BeautifulSoup(content3.text, 'html.parser')
newItemList = []
for products in soup3.find_all("li", {"class": 'product-item'}):
for products_info in products.find_all("strong", {"class": "product-item-name"}):
for name in products_info.find_all("a"):
for productsPriceInfo in products.find_all("div", {"class": "price-box price-final_price"}):
for productsPriceInfoAdv in productsPriceInfo.find_all("span", {
"class": "price-wrapper price-including-tax"}):
for finalPrice in productsPriceInfoAdv.find_all("span", {"class": "price"}):
productNames = {}
productNames['name'] = name['title']
productNames['price'] = finalPrice.get_text()
productNames['link'] = name['href']
productNames['ID'] = productsPriceInfo['data-product-id']
# dict = {'names': name['title']}
newItemList.append(productNames)
return newItemList

这段代码返回的结果格式类似于:name: "name", link: "link" 等。

改写为对父元素集合的单层循环:先选出包含全部数据的父节点(每个产品对应一个节点),然后在循环中相对于该节点进行选择,取出每个节点中的各项。

我假设id是产品SKU。

import requests
import numpy as np
from bs4 import BeautifulSoup as bs
# Collect one dict per product listing on the page.
newItemList = []
base = 'https://www.nickollsandperks.co.uk/'
r = requests.get('https://www.nickollsandperks.co.uk/New-and-Special-Offers/New-Whisky?order=relevance:asc')
soup = bs(r.content, 'lxml')
# Single loop over the per-product parent elements; every field is selected
# relative to the current listing instead of through nested loops.
for listing in soup.select('.facets-items-collection-view-cell-span12'):
    price = listing.select_one('.product-views-price')
    # Not every listing has a price. np.nan is used because the np.NaN alias
    # was removed in NumPy 2.0.
    price = price.text if price is not None else np.nan

    newItemList.append(
        {
            'name': listing.select_one('.facets-item-cell-list-name span').text,
            'link': base + listing.select_one('.facets-item-cell-list-name')['href'],
            'price': price,
            'id': listing.select_one('.facets-item-cell-list')['data-sku'],
        }
    )

最新更新