Python正则表达式:re.findall方法抛出列表索引超出范围错误



我正在使用python正则表达式学习web抓取,并练习以下脚本源代码,但当我运行时,它会抛出IndexError: list index out of range

import re
import json
import requests
url = 'https://www.att.com/buy/phones/'
# Download the page and keep the raw HTML as a string.
html_text = requests.get(url).text
# Capture the JSON object assigned to __NEXT_DATA__ in the page source.
# re.findall returns an empty list when the pattern matches nothing, so the
# [0] index below is where "IndexError: list index out of range" is raised.
data = json.loads(re.findall(r'__NEXT_DATA__ = (.*?});', html_text)[0])
# Pretty-print the device list looked up under props -> pageProps.
print(json.dumps(data['props']['pageProps']['deviceList'], indent=4))

您遇到的问题是网页内容动态变化的直接结果。网站不是静态的,2019年的解决方案现在可能已不再适用。我建议使用Beautiful Soup(bs4)来编写更健壮的脚本,而不是用自定义正则表达式来查找JSON。

下面的代码会给你想要的;

import json
import requests
from bs4 import BeautifulSoup
url = 'https://www.att.com/buy/phones/'

# Fetch the page. The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops us from trying to parse
# an HTTP error page as if it were the real content.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_text = response.text

# Name the parser explicitly so the parse result does not depend on which
# optional parsers are installed (and bs4 does not warn about guessing).
soup = BeautifulSoup(html_text, 'html.parser')

# The JSON payload is expected inside <script id="__NEXT_DATA__">.
next_data = soup.find('script', id='__NEXT_DATA__')
if next_data is None:
    # find() returns None when nothing matches; report that clearly rather
    # than failing with an AttributeError on the next line.
    raise RuntimeError(
        'No <script id="__NEXT_DATA__"> element found; '
        'the page layout may have changed.'
    )
data = json.loads(next_data.text)

# Pretty-print the device list at its current location in the payload.
print(json.dumps(data['props']['initialReduxState']['solr']['deviceList'], indent=4))

代码的解释

requests库从给定的URL获取原始HTML文本,我们使用bs4对其进行解析(若未显式指定解析器,bs4会选用已安装的最佳解析器,安装了lxml时即为lxml)。然后,我们使用find函数搜索id为"__NEXT_DATA__"的script标签,并从中取得标签内部的文本,该文本就是JSON。最后,我们用json库加载这段文本,并找到"deviceList"的新位置。有关bs4的更多文档,请参阅https://www.crummy.com/software/BeautifulSoup/bs4/doc

较长JSON输出中的第一项

{
"firstNet": "notApplicable",
"productFamily": "Phn13",
"comingSoon": false,
"skuId": "sku2360531",
"brand": "Apple",
"displayContentItems": [],
"deviceGroup": "network",
"starRatings": 4.5962,
"numOfStarReviews": 2959,
"mobileImageUrl": [
"/idpassets/global/devices/phones/apple/apple-iphone-13/defaultimage/pink-hero-zoom.png?imwidth=219"
],
"largeImageURL": "//www.att.com/catalog/en/skus/images/apple-iphone%2013-pink-450x350.png",
"model": "iPhone 13",
"productName": "Apple iPhone 13",
"billCode": "6164D",
"name": "jared",
"PDPPageURL": [
"/buy/phones/apple-iphone-13-128gb-pink.html"
],
"prepaid": "",
"productURL": "//www.att.com/cellphones/iphone/apple-iphone-13.html#sku=sku2360531",
"condition": "New",
"productId": "prod10340592",
"htmlColor": "#FADDD7",
"isPrepaid": false,
"isRefurbished": false,
"isPreOwned": false,
"isPrePreOrderable": false,
"type": "Device",
"color": "Pink",
"FinalPriceIRU": 22.23,
"FinalPriceCRU": 22.23,
"FinalPlanType": "monthly",
"FinalPrice": 22.23,
"FinalnextUpCharge": [
0
],
"FinalIRUnextUpCharge": [
0
],
"FinalCRUnextUpCharge": [
0
],
"FinalCommitmentTerm": "NE36MNUP",
"FinalCommitmentTermCRU": "NE36MNUP",
"FinalCommitmentTermIRU": "NE36MNUP",
"FinalBasePriceCRU": 22.23,
"FinalBasePriceIRU": 22.23,
"FinalPlanTypeCRU": "monthly",
"FinalPlanTypeIRU": "monthly",
"FinalBasePrice": 22.23,
"FinalTermLength": 36,
"FinalTermLengthIRU": 36,
"FinalTermLengthCRU": 36,
"consumerOfferDescription": "$0 w/Trade",
"cruOfferDescription": "$0 w/Trade",
"iruOfferDescription": "$0 w/Trade",
"consumerOfferDescriptionAL": "$0 w/Trade",
"consumerOfferDescriptionUP": "$0 w/Trade",
"iruOfferDescriptionAL": "$0 w/Trade",
"iruOfferDescriptionUP": "$0 w/Trade",
"cruOfferDescriptionAL": "$0 w/Trade",
"cruOfferDescriptionUP": "$0 w/Trade",
"allProductIds": [
"prod10340592",
"prod10340591",
"prod10340593"
],
"allSkuIds": [
"sku2360531",
"sku2360535",
"sku2360534",
"sku2360527",
"sku2360528",
"sku2360530",
"sku2360529",
"sku2360537",
"sku2360526",
"sku2360536",
"sku2360533",
"sku10940263",
"sku10940264",
"sku10940268",
"sku10940269"
],
"allBillCodes": [
"6164D",
"6166D",
"6162D",
"6165D",
"6163D",
"6169D",
"6171D",
"6167D",
"6170D",
"6168D",
"6174D",
"6176D",
"6172D",
"6175D",
"6173D"
],
"tradeInLegalModalPath": "/idpassets/fragment/legal/prod/legalcontent/wireless/offers/19900012/19900012_offertray_lm.cmsfeed.js",
"tradeInLegalText": "Requ2019s elig. unlimited (speed restru2019s apply) & trade-in. Price after 36 mo. credits. Other terms apply. ",
"tradeInShortLegalLinkLabel": "See offer details",
"tradeInPromoReference": "19900012",
"tradeInMonthlyPromoPrice": "0",
"tradeInLegalModalPathCRU": "/idpassets/fragment/legal/prod/legalcontent/wireless/offers/19900012/19900012_offertray_lm.cmsfeed.js",
"tradeInLegalTextCRU": "Requ2019s elig. unlimited (speed restru2019s apply) & trade-in. Price after 36 mo. credits. Other terms apply. ",
"tradeInShortLegalLinkLabelCRU": "See offer details",
"tradeInPromoReferenceCRU": "19900012",
"tradeInMonthlyPromoPriceCRU": "0"
}

最新更新