from bs4 import BeautifulSoup
import requests
# Address of the "most runs" statistics page for the 2021 season.
url = "https://www.iplt20.com/stats/2021/most-runs"

# Download the page and parse the returned markup with the built-in HTML parser.
source = requests.get(url)
soup = BeautifulSoup(source.text, features="html.parser")

# Look up every <table> element carrying the stats-table CSS class.
# The result is not stored; it is only visible in an interactive session.
soup.find_all(name="table", class_="np-mostruns_table")
这个网站的内容完全由 JavaScript 渲染,而 requests 库无法执行 JavaScript。
您必须使用自动化浏览器,例如 Selenium 或类似的工具。
我还建议在抓取网页时安装一个可以禁用 JavaScript(随时开启/关闭)的浏览器扩展,例如:
Toggle JS(切换 JS)
如果您要查找带有指定 class 的表格,应该使用:
# Return the first <table> whose class attribute matches the stats-table class.
soup.find("table", attrs={"class": "np-mostruns_table"})
这个表格是动态加载的,因此无法直接从页面的 HTML 中获取。您需要找到加载该表格数据的请求,并根据其返回的数据自行构建表格。返回的数据包含的字段比网站上显示的更多,因此您可以按需添加其他字段。下面的示例只使用了网站上显示的那些字段:
import requests
import json
import pandas as pd
# Feed that backs the "most runs" table. The ``callback`` query parameter
# suggests the body is a JSONP-style call, ``ontoprunsscorers(<json>)``,
# rather than bare JSON.
url = 'https://ipl-stats-sports-mechanic.s3.ap-south-1.amazonaws.com/ipl/feeds/stats/60-toprunsscorers.js?callback=ontoprunsscorers'

# Output column name -> key of the per-player record in the feed.
# Insertion order defines the column order of the resulting DataFrame.
COLUMN_SOURCES = {
    'Player': 'StrikerName',
    'Mat': 'Matches',
    'Inns': 'Innings',
    'NO': 'NotOuts',
    'Runs': 'TotalRuns',
    'HS': 'HighestScore',
    'AVG': 'BattingAverage',
    'BF': 'Balls',
    'SR': 'StrikeRate',
    '100': 'Centuries',
    '50': 'FiftyPlusRuns',
    '4s': 'Fours',
    '6s': 'Sixes',
}


def parse_jsonp(text):
    """Decode the JSON payload wrapped in a ``callback(...)`` call.

    Args:
        text: Raw response body of the form ``name(<json>)``, optionally
            followed by trailing characters such as ``;``.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If no ``(...)`` wrapper is found or the payload is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    start = text.find('(')
    # Use the LAST ')' so parentheses inside JSON string values do not
    # truncate the payload.
    end = text.rfind(')')
    if start == -1 or end <= start:
        raise ValueError('response is not wrapped in a callback(...) call')
    return json.loads(text[start + 1:end])


def build_rows(players):
    """Project each player record onto the columns in ``COLUMN_SOURCES``.

    Args:
        players: Iterable of mappings that contain every source key listed
            in ``COLUMN_SOURCES``.

    Returns:
        A list with one dict per player, keyed by output column name.

    Raises:
        KeyError: If a record lacks one of the expected source keys.
    """
    return [
        {column: player[field] for column, field in COLUMN_SOURCES.items()}
        for player in players
    ]


if __name__ == "__main__":
    # A timeout keeps the script from hanging on an unresponsive server.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    json_data = parse_jsonp(response.text)
    results = build_rows(json_data['toprunsscorers'])
    df = pd.DataFrame(results)
    print(df)
输出:
Player Mat Inns NO Runs HS ... BF SR 100 50 4s 6s
0 Jos Buttler 17 17 2 863 116 ... 579 149.05 4 4 83 45
1 K L Rahul 15 15 3 616 103* ... 455 135.38 2 4 45 30
2 Quinton De Kock 15 15 1 508 140* ... 341 148.97 1 3 47 23
3 Hardik Pandya 15 15 4 487 87* ... 371 131.26 0 4 49 12
4 Shubman Gill 16 16 2 483 96 ... 365 132.32 0 4 51 11
.. ... .. ... .. ... ... ... ... ... .. .. .. ..
157 Fazalhaq Farooqi 3 1 1 2 2* ... 8 25.00 0 0 0 0
158 Jagadeesha Suchith 5 2 0 2 2 ... 8 25.00 0 0 0 0
159 Tim Southee 9 5 1 2 1* ... 12 16.66 0 0 0 0
160 Nathan Coulter-Nile 1 1 1 1 1* ... 2 50.00 0 0 0 0
161 Anrich Nortje 6 1 1 1 1* ... 6 16.66 0 0 0 0