Web Scraping with __doPostBack



I'm trying to scrape data from https://www.brickeconomy.com/sets/theme/collectable-minifigures (I only need the URL of each LEGO set's page), but the site paginates with the JavaScript __doPostBack function. From many related answers I know I need to inspect the POST request to identify the form data it sends, as shown below:

[Screenshot of the request form data]
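For context, __doPostBack(eventTarget, eventArgument) just copies its two arguments into the hidden __EVENTTARGET and __EVENTARGUMENT inputs and submits the main ASP.NET form, so the rest of the payload is whatever hidden inputs that form already contains. A minimal sketch of collecting them (illustrative only; the hidden dict and the CSS selector are my own, not taken from the request above):

import requests
from bs4 import BeautifulSoup

url = "https://www.brickeconomy.com/sets/theme/collectable-minifigures"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# Gather every hidden input from the ASP.NET form so a replayed postback
# carries the same state fields a browser would send (__VIEWSTATE,
# __VIEWSTATEGENERATOR, __EVENTVALIDATION if present, ...).
hidden = {inp.get("name"): inp.get("value", "")
          for inp in soup.select("form input[type=hidden]")
          if inp.get("name")}
hidden["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$ctlSets$GridViewSets"  # control that "fired"
hidden["__EVENTARGUMENT"] = "Page$2"  # page argument passed to __doPostBack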

My code is as follows:

import requests
from bs4 import BeautifulSoup

url = "http://www.brickeconomy.com/sets/theme/collectable-minifigures"
page_content = requests.get(url).content
soup = BeautifulSoup(page_content, 'html.parser')

# ASP.NET state tokens that must be echoed back with every postback
VIEWSTATEGENERATOR = soup.find('input', {'id': '__VIEWSTATEGENERATOR'}).get('value')
VIEWSTATE = soup.find('input', {'id': '__VIEWSTATE'}).get('value')

headers = {'user-agent': 'Mozilla/5.0'}
data = {
    "ctl00$ScriptManager1": "ctl00$ContentPlaceHolder1$ctlSets$UpdatePanelMain|ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "ctl00$txtSearchHeader2": "",
    "ctl00$txtSearchHeader": "",
    "subthemesorter": "",
    "setsorter": "SetNumberDESC",
    "ctl00$LoginModalUsername": "",
    "ctl00$LoginModalPassword": "",
    "__EVENTTARGET": "ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "__EVENTARGUMENT": "Page$2",
    "__VIEWSTATE": VIEWSTATE,
    "__VIEWSTATEGENERATOR": VIEWSTATEGENERATOR,
    "__ASYNCPOST": 'true'
}
res = requests.post(url, data=data, headers=headers).content
BeautifulSoup(res, 'html.parser').find_all(class_='mb-5')

However, it still returns the data from the first page. Any suggestions would be much appreciated. Thank you!

You were sending the POST request to the wrong URL. Once I replaced the existing URL with the correct one, the script started working:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://www.brickeconomy.com'
start_url = 'http://www.brickeconomy.com/sets/theme/collectable-minifigures'
post_url = 'https://www.brickeconomy.com/sets/theme/sets/theme/collectable-minifigures'

data = {
    "ctl00$ScriptManager1": "ctl00$ContentPlaceHolder1$ctlSets$UpdatePanelMain|ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "ctl00$txtSearchHeader2": "",
    "ctl00$txtSearchHeader": "",
    "subthemesorter": "",
    "setsorter": "SetNumberDESC",
    "ctl00$LoginModalUsername": "",
    "ctl00$LoginModalPassword": "",
    "__EVENTTARGET": "ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "__EVENTARGUMENT": "Page$2",
    "__ASYNCPOST": 'true'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    # Fetch the landing page first to pick up the ASP.NET state tokens.
    r = s.get(start_url)
    soup = BeautifulSoup(r.text, "lxml")
    data['__VIEWSTATE'] = soup.find('input', {'id': '__VIEWSTATE'}).get('value')
    data['__VIEWSTATEGENERATOR'] = soup.find('input', {'id': '__VIEWSTATEGENERATOR'}).get('value')
    # Post the simulated __doPostBack to the form's real action URL.
    res = s.post(post_url, data=data)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select("table.table > tr h4 > a"):
        inner_url = urljoin(base, item.get("href"))
        print(inner_url)
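To walk more than one extra page, the same Session can keep posting with an updated __EVENTARGUMENT. One wrinkle: since __ASYNCPOST is set, the server answers in the ASP.NET AJAX "delta" format rather than full HTML, and the refreshed __VIEWSTATE comes back inside a |hiddenField|NAME|VALUE| segment. The loop below is a sketch under that assumption; it would replace the single s.post(...) and print section inside the with block above (reusing s, post_url, data and base), and the updated_hidden helper plus the example page range are mine, not part of the original answer:

import re

def updated_hidden(delta_text, name, fallback):
    # Assumption: __ASYNCPOST replies use the standard ASP.NET AJAX delta
    # format, where refreshed hidden fields appear as |hiddenField|NAME|VALUE|.
    m = re.search(r"\|hiddenField\|%s\|([^|]*)\|" % re.escape(name), delta_text)
    return m.group(1) if m else fallback

for page in range(2, 6):  # pages 2-5, purely an example range
    data["__EVENTARGUMENT"] = f"Page${page}"
    res = s.post(post_url, data=data)
    page_soup = BeautifulSoup(res.text, "lxml")
    for item in page_soup.select("table.table > tr h4 > a"):
        print(urljoin(base, item.get("href")))
    # Carry the server-issued state forward so the next postback is accepted.
    data["__VIEWSTATE"] = updated_hidden(res.text, "__VIEWSTATE", data["__VIEWSTATE"])
    data["__VIEWSTATEGENERATOR"] = updated_hidden(
        res.text, "__VIEWSTATEGENERATOR", data["__VIEWSTATEGENERATOR"])

As an aside, the doubled /sets/theme/ in post_url suggests the form's action attribute is a relative path resolved against the page's own path; if so, it could be derived from the landing page rather than hard-coded, e.g. urljoin(r.url, BeautifulSoup(r.text, "lxml").find("form").get("action")).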
