生成csv时索引超出范围



我正在尝试建立一个体育博彩程序。

目前我卡在生成 CSV 这一步:这个 CSV 应包含前两个大学篮球赛季的全部技术统计(box score)。程序会从我之前已经生成的另一个 CSV 中读取 boxscore 索引。

每当进度条走到第 10653 次迭代时,我就会得到“索引超出范围”(list index out of range)的错误。我查看了它正在读取的 CSV 的第 10653 行,但没有发现任何特别之处。

我知道迭代次数与 CSV 中的行是一一对应的,因为当我只运行 _df = Boxscore(box_link).dataframe 这一行之前的所有代码时,进度条会在第 14980 次迭代时完成,这与它所读取的 CSV 的行数相同。

任何帮助都将非常感激。下面是代码和错误信息。

from sportsreference.ncaab.boxscore import Boxscore
# Pull the box score of every game from `start_season` onward and write them
# all to a single CSV.
# NOTE(review): `pd` and `tqdm` are used but not imported in this snippet;
# presumably they were imported in an earlier notebook cell -- confirm.
start_season = 2020 # below code will pull data from all seasons starting from this year
box_df = None
schedule_df = pd.read_csv('ncaab - sheet81 - ncaab - sheet81.csv')#if only running for testing, a smaller csv may be used to speed up the process
season_df = schedule_df.loc[schedule_df.Season>=start_season]
for index, row in tqdm(season_df.iterrows()):
    box_link = row['BoxscoreIndex']
    _df = Boxscore(box_link).dataframe #The line to left is where the error keeps coming in "list index out of range". I ran everything above this and it works fine.  

    # Accumulate every game's frame into one running DataFrame.
    if box_df is not None:
        box_df = pd.concat([box_df,_df],axis=0)
    else:
        box_df = _df

box_df.to_csv('boxscores3.csv'.format(start_season),index=None)    
IndexError                                Traceback (most recent call last)
<ipython-input-24-91c5b71b03e2> in <module>
6 for index, row in tqdm(season_df.iterrows()):
7     box_link = row['BoxscoreIndex']
----> 8     _df = Boxscore(box_link).dataframe #The line to left is where the error keeps coming in "list index out of range". I ran everything above this and it works fine.
9 
10     if box_df is not None:
~\Downloads\WPy64-3860\python-3.8.6.amd64\lib\site-packages\sportsreference\ncaab\boxscore.py in __init__(self, uri)
223         self._home_defensive_rating = None
224 
--> 225         self._parse_game_data(uri)
226 
227     def _retrieve_html_page(self, uri):
~\Downloads\WPy64-3860\python-3.8.6.amd64\lib\site-packages\sportsreference\ncaab\boxscore.py in _parse_game_data(self, uri)
668             if short_field == 'away_record' or \
669                short_field == 'home_record':
--> 670                 value = self._parse_record(short_field, boxscore, index)
671                 setattr(self, field, value)
672                 continue
~\Downloads\WPy64-3860\python-3.8.6.amd64\lib\site-packages\sportsreference\ncaab\boxscore.py in _parse_record(self, field, boxscore, index)
375         records = boxscore(BOXSCORE_SCHEME[field]).items()
376         records = [x.text() for x in records if x.text() != '']
--> 377         return records[index]
378 
379     def _find_boxscore_tables(self, boxscore):
IndexError: list index out of range

首先要指出的是,这里的 .format() 调用 'boxscores3.csv'.format(start_season) 实际上没有任何效果,它仍然只会返回 'boxscores3.csv'。你需要在文件名中加入占位符。例如,如果 start_season = '2020',那么 'boxscores3_{0}.csv'.format(start_season) 就会得到 'boxscores3_2020.csv'。

如果你想让文件名是动态的,可以改成以下任意一种写法:

# Option 1: positional placeholder filled in by str.format().
box_df.to_csv('boxscores3_{0}.csv'.format(start_season),index=None)

# Option 2: named placeholder filled in by str.format().
box_df.to_csv('boxscores3_{some_variable}.csv'.format(some_variable = start_season),index=None)

# Option 3: printf-style formatting with the % operator.
box_df.to_csv('boxscores3_%s.csv' % start_season, index=None)

接下来,除非您可以提供该csv文件的示例,特别是第10653行,否则无法真正帮助您解决特定问题。

然而,在那之前,我可以提供一个使用espn api的替代解决方案。

只要有 gameId,就可以获取大学篮球比赛的技术统计(box score)。所以这段代码会遍历每个日期(需要给出一个开始日期),获取每场比赛的 gameId;然后利用 gameId,从另一个 API 端点获取该场比赛的技术统计。遗憾的是,box score 不是以 JSON 格式返回的,而是以 HTML 格式返回(不过这没关系,因为我们可以用 pandas 直接读取其中的表格)。

我不知道你到底需要什么或想要什么,但这可能会帮助你在学习python时看到其他获取数据的方法:

代码:

from tqdm import tqdm
import requests
import pandas as pd
import datetime

# Build the list of dates to query, as YYYYMMDD strings, from the start date
# through today.
sdate = datetime.date(2021, 1, 1)   # start date
edate = datetime.date.today()  # end date
delta = edate - sdate       # as timedelta
# "+ 1" so that the end date itself is included.
date_list = [
    (sdate + datetime.timedelta(days=i)).strftime("%Y%m%d")
    for i in range(delta.days + 1)
]


# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}

# Query-string parameters sent with the schedule request.
payload = {
    'xhr': '1',
    'device': 'desktop',
    'country': 'us',
    'lang': 'en',
    'region': 'us',
    'site': 'espn',
    'edition-host': 'espn.com',
    'site-type': 'full',
}
# Get gameIds: for every date, record the id and the two team short names of
# each game that is not postponed, canceled or still only scheduled.
# Result shape: {dateStr: [{gameId: {'awayTeam': ..., 'homeTeam': ...}}, ...]}
gameId_dict = {}
for dateStr in tqdm(date_list):
    url = 'https://secure.espn.com/core/mens-college-basketball/schedule/_/date/{dateStr}/group/50'.format(dateStr=dateStr)
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, headers=headers, params=payload, timeout=30)
    schedule = response.json()['content']['schedule']
    # A date that is absent from the schedule is treated as having no games
    # instead of raising KeyError.
    games = schedule.get(dateStr, {}).get('games', [])
    gameId_dict[dateStr] = []
    for game in games:
        # Check if game was postponed
        if game['status']['type']['name'] in ['STATUS_POSTPONED','STATUS_CANCELED','STATUS_SCHEDULED']:
            continue
        short_name = game['shortName']
        # shortName is expected to look like "AWAY @ HOME".
        # NOTE(review): neutral-site games presumably use "A VS B" -- confirm
        # against the API. Without this fallback, a short name that has no
        # '@' raised IndexError on split('@')[1].
        separator = '@' if '@' in short_name else ' VS '
        away_team, _, home_team = short_name.partition(separator)
        gameId_dict[dateStr].append({
            game['id']: {
                'awayTeam': away_team.strip(),
                'homeTeam': home_team.strip(),
            },
        })

# Local import: read_html is given a file-like object because passing literal
# HTML strings is deprecated in recent pandas releases.
from io import StringIO


def _team_box_score(table, team, home_away):
    """Clean one team's raw box score table.

    Args:
        table: DataFrame read from one team's box score HTML table.
        team: team short name taken from the schedule.
        home_away: 'Home' or 'Away'.

    Returns:
        A new DataFrame without the 'TEAM' row and the rows that have no
        player, with 'Team', 'Home_Away' and 'Starter_Bench' columns added.
    """
    df = table.rename(columns={'Bench':'Player'})
    df = df[df['Player'] != 'TEAM']
    # .copy() so the assignments below modify an independent frame rather
    # than a slice of the parsed table.
    df = df[df['Player'].notna()].copy()
    df['Team'] = team
    df['Home_Away'] = home_away
    df['Starter_Bench'] = 'Bench'
    # Rows labelled 0 through 4 are marked as starters.
    df.loc[0:4, 'Starter_Bench'] = 'Starter'
    df['Player'] = df['Player'].str.split(r"([a-z]+)([A-Z].+)", expand=True)[2]
    # The trailing run of capital letters is split off the player string and
    # stored in 'Team', replacing the value assigned above.
    df[['Player','Team']] = df['Player'].str.extract('^(.*?)([A-Z]+)$', expand=True)
    return df


# Box score - gameId needed
box_url = 'https://secure.espn.com/core/mens-college-basketball/boxscore'
# Per-game frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop copies it on every iteration.
game_frames = []
for dateStr, games in tqdm(gameId_dict.items()):
    for game in tqdm(games):
        for gameId, teams in game.items():
            payload = {
                'gameId': gameId,
                'xhr': '1',
                'render': 'true',
                'device': 'desktop',
                'country': 'us',
                'lang': 'en',
                'region': 'us',
                'site': 'espn',
                'edition-host': 'espn.com',
                'site-type': 'full'}

            data = requests.get(box_url, headers=headers, params=payload, timeout=30).json()
            # Parse the HTML once; table 0 is used for the away team and
            # table 1 for the home team.
            tables = pd.read_html(StringIO(data['content']['html']), header=1)
            away_df = _team_box_score(tables[0], teams['awayTeam'], 'Away')
            home_df = _team_box_score(tables[1], teams['homeTeam'], 'Home')

            game_df = pd.concat([away_df, home_df], sort=False)
            game_df['Date'] = datetime.datetime.strptime(dateStr, '%Y%m%d').strftime('%m/%d/%y')
            game_frames.append(game_df)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
if game_frames:
    full_df = pd.concat(game_frames, sort=False).reset_index(drop=True)
else:
    full_df = pd.DataFrame()

输出:

print (full_df.head(30).to_string())
Player MIN    FG  3PT   FT OREB DREB REB AST STL BLK TO PF PTS  Team Home_Away Starter_Bench Pos      Date
0             H. Drame  22   2-7  0-2  0-0    1    1   2   0   0   1  1  4   4   SPU      Away       Starter   F  01/01/21
1             F. Drame  20   2-3  0-1  0-0    1    5   6   0   3   1  1  4   4   SPU      Away       Starter   F  01/01/21
2               M. Lee  24  2-11  0-4  1-2    1    2   3   0   0   0  3  0   5   SPU      Away       Starter   G  01/01/21
3             D. Banks  26  4-12  1-6  2-4    0    5   5   6   1   0  1  1  11   SPU      Away       Starter   G  01/01/21
4             D. Edert  32  6-10  2-4  1-2    0    4   4   0   2   0  1  2  15   SPU      Away       Starter   G  01/01/21
5           O. Diahame   1   0-1  0-0  0-0    0    0   0   0   0   0  0  0   0   SPU      Away         Bench   F  01/01/21
6             K. Ndefo  23  7-10  0-0  3-3    1    6   7   2   1   5  1  4  17   SPU      Away         Bench   F  01/01/21
7            B. Diallo  14   0-2  0-0  0-0    1    1   2   0   0   0  0  0   0   SPU      Away         Bench   G  01/01/21
8             T. Brake  24   1-2  0-1  0-0    0    0   0   1   0   0  0  1   2   SPU      Away         Bench   G  01/01/21
9           M. Silvera   6   0-0  0-0  0-0    0    1   1   1   0   0  1  0   0   SPU      Away         Bench   G  01/01/21
10            N. Kamba   8   0-1  0-0  0-0    0    0   0   0   0   0  2  0   0   SPU      Away         Bench   G  01/01/21
11            J. Fritz  38   5-9  0-0  4-5    2    8  10   4   1   3  1  3  14   CAN      Home       Starter   F  01/01/21
12            J. White  17   4-7  1-2  0-0    1    4   5   2   0   0  5  2   9   CAN      Home       Starter   F  01/01/21
13           A. Fofana  20   1-7  1-4  1-2    0    1   1   1   0   0  1  2   4   CAN      Home       Starter   G  01/01/21
14          A. Harried  23  3-10  1-4  0-1    2    5   7   1   1   1  0  1   7   CAN      Home       Starter   G  01/01/21
15        J. Henderson  37   3-8  3-5  5-6    0    1   1   2   0   0  1  1  14   CAN      Home       Starter   G  01/01/21
16      G. Maslennikov   2   0-2  0-1  0-0    0    0   0   0   0   0  1  1   0   CAN      Home         Bench   F  01/01/21
17            M. Green  18   3-4  0-0  2-2    1    4   5   2   1   0  2  1   8   CAN      Home         Bench   F  01/01/21
18          S. Hitchon   3   0-0  0-0  0-0    0    0   0   1   0   0  0  0   0   CAN      Home         Bench   F  01/01/21
19       S. Uijtendaal  20   2-4  1-2  0-0    0    0   0   0   1   0  0  2   5   CAN      Home         Bench   G  01/01/21
20          M. Brandon  19   4-5  1-2  0-0    0    3   3   2   2   0  2  1   9   CAN      Home         Bench   G  01/01/21
21           A. Ahemed   3   0-0  0-0  0-0    0    1   1   1   0   0  0  1   0   CAN      Home         Bench   G  01/01/21
22           K. Nwandu  34  5-13  1-3  0-1    1    3   4   3   1   0  3  1  11  NIAG      Away       Starter   F  01/01/21
23      G. Kuakumensah  23   1-2  1-2  1-2    0    2   2   1   0   0  1  1   4  NIAG      Away       Starter   F  01/01/21
24         N. Kratholm  18   4-7  0-0  3-5    2    2   4   1   0   0  0  2  11  NIAG      Away       Starter   F  01/01/21
25          M. Hammond  33  7-14  3-6  0-0    0    4   4   1   1   0  2  2  17  NIAG      Away       Starter   G  01/01/21
26          J. Roberts  28   2-6  2-6  2-2    0    2   2   3   1   0  2  3   8  NIAG      Away       Starter   G  01/01/21
27          J. Cintron  14   0-2  0-0  0-0    1    3   4   0   0   1  2  1   0  NIAG      Away         Bench   F  01/01/21
28  DonaldN. MacDonald   9   0-1  0-1  0-0    0    3   3   0   0   0  0  0   0  NIAG      Away         Bench   G  01/01/21
29          R. Solomon  25  4-11  0-2  2-2    1    3   4   0   3   0  0  1  10  NIAG      Away         Bench   G  01/01/21

其实你并不一定需要使用那个库,只用 requests、BeautifulSoup 和 pandas 就可以了(那个库在底层很可能也是用它们实现的)。下面这段代码现在应该可以满足你的需要。之后我们可能还要补充一些东西(比如延迟/sleep),以应对短时间内请求过多而被封锁的情况——我怀疑这正是导致你问题的原因——但这段代码可以让你先继续往下做。先看看效果如何,之后我们再调整。

import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from tqdm import tqdm
import time
# Input CSV of games; the 'BoxscoreIndex' column supplies the links to scrape.
season_df = pd.read_csv('C:/test/ncaab.csv')
links_list = season_df['BoxscoreIndex'].tolist()

# Seconds to wait before retrying a failed request.
delay = 30

def parse_tables(idx, basic_tables, advanced_tables):
    """Build one team's 'School Totals' row from its basic and advanced tables.

    Args:
        idx: position of the team's table in both lists.
        basic_tables: sequence of basic box score table elements.
        advanced_tables: sequence of advanced box score table elements.

    Returns:
        A DataFrame holding the basic 'School Totals' row joined with the
        advanced columns that the basic table does not already have, plus a
        'Team' column; None when the tables are missing or cannot be parsed.
    """
    # Local import: read_html is given a file-like object because passing
    # literal HTML strings is deprecated in recent pandas releases.
    from io import StringIO

    try:
        basic = basic_tables[idx]
        advanced = advanced_tables[idx]
        # Team name: the caption text before any '(' and before 'Table'.
        team = basic.find('caption').text.split('(')[0].strip()
        team = team.split('Table')[0].strip()

        df_basic = pd.read_html(StringIO(str(basic)), header=1)[0]
        df_basic = df_basic[df_basic['Starters'] == 'School Totals']
        df_basic = df_basic.reset_index(drop=True)

        df_advanced = pd.read_html(StringIO(str(advanced)), header=1)[0]
        df_advanced = df_advanced[df_advanced['Starters'] == 'School Totals']
        df_advanced = df_advanced.reset_index(drop=True)

        # Columns present in both tables are dropped from the advanced one so
        # that the join does not produce duplicate column names.
        drop_cols = [col for col in df_basic.columns if col in df_advanced.columns]

        df = df_basic.drop(['Starters'], axis=1).join(df_advanced.drop(drop_cols, axis=1))
        df['Team'] = team
        return df
    except Exception:
        # Best effort: any parsing problem yields None. Unlike a bare
        # `except:`, this lets KeyboardInterrupt and SystemExit propagate so
        # a long scrape can still be stopped with Ctrl+C.
        return None

def get_html(box_link):
    """Download and parse one box score page.

    Args:
        box_link: box score identifier inserted into the page URL.

    Returns:
        (soup, True) when the page was fetched with status 200, otherwise
        (None, False).
    """
    url = 'https://www.sports-reference.com/cbb/boxscores/%s.html' %box_link
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Report network failures the same way as a bad status code, so the
        # caller's retry logic handles them instead of the script crashing.
        return None, False

    if response.status_code != 200:
        return None, False
    return BeautifulSoup(response.text, 'html.parser'), True


# Scrape every distinct box score link and collect one totals row per team.
max_retries = 5  # attempts per link before it is reported as an error
visited_links = set()  # set: constant-time duplicate check
errors_list = []
# Per-team frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop copies it on every iteration.
team_frames = []
for box_link in tqdm(links_list):
    # Skip links that were already processed (the same link can appear on
    # more than one row of the input).
    if box_link in visited_links:
        continue
    visited_links.add(box_link)

    soup, resp_success = get_html(box_link)
    attempts = 1
    # Retry a bounded number of times; a permanently failing page (e.g. a
    # 404) would otherwise be retried forever.
    while not resp_success and attempts < max_retries:
        print ('Will retry in %s seconds.' %delay)
        time.sleep(delay)
        soup, resp_success = get_html(box_link)
        attempts += 1
    if not resp_success:
        errors_list.append(box_link)
        continue

    basic_tables = soup.find_all('table', {'id': re.compile('.*box-score-basic.*')})
    advanced_tables = soup.find_all('table', {'id': re.compile('.*box-score-advanced.*')})

    # Fewer than two basic tables is flagged for manual review.
    if len(basic_tables) < 2:
        errors_list.append(box_link)

    for idx in range(len(basic_tables)):
        df = parse_tables(idx,basic_tables,advanced_tables)
        # parse_tables returns None when a table could not be parsed.
        if df is not None:
            team_frames.append(df)


# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
if team_frames:
    box_df = pd.concat(team_frames, sort=False).reset_index(drop = True)
else:
    box_df = pd.DataFrame()
if errors_list:
    print('\n')
    print ('You may want to investigate the following.')
    for each in errors_list:
        print('Error with: %s' %each)

最新更新