我有一个数据帧,其中团队名称以字符串形式给出。拆分字符串以获得单个团队名称的规则是
- 每个大写字母都会开始一个新的团队名称
- 数字同样会开始一个新的团队名称,除非它前面的字符也是数字
- 第一个非数字或非字母字符后没有团队名称
我的解决方案如下。它有效,但我正在寻找改进的指针
- 整个拆分是否可以用一个正则表达式(regex)完成
- 有没有比使用 apply 更快的方法。我看到了一些关于使用并行版本来加速 apply 的说明,但不知道是否还有更快的方法可以做到这一点
import pandas as pd
import numpy as np
import re
# Leading run of ASCII letters/digits. Everything from the first other
# character onwards is ignored, mirroring the early exit of a char-by-char scan.
_TEAM_PREFIX_RE = re.compile(r'[a-zA-Z0-9]*')

# One team name inside that prefix:
#   - an uppercase letter followed by lowercase letters, or
#   - a run of digits followed by lowercase letters (e.g. "49ers"), or
#   - a leading run of lowercase letters (only possible at the very start).
_TEAM_NAME_RE = re.compile(r'[A-Z][a-z]*|[0-9]+[a-z]*|[a-z]+')


def splitDfCap(next_row):
    """Split the run-together team names in ``next_row.Teams`` into a list.

    Rules:
      * every uppercase ASCII letter starts a new team name;
      * a digit starts a new team name unless the previous character was
        also a digit;
      * scanning stops at the first character that is not an ASCII letter
        or digit, and nothing after it is returned.

    Args:
        next_row: Any object exposing a ``Teams`` attribute (for example a
            DataFrame row passed by ``df.apply(..., axis='columns')``).

    Returns:
        A list of team-name strings, in order of appearance. An empty list
        is returned for an empty string or when ``Teams`` is not a string
        (e.g. ``None`` / ``NaN`` for a missing value).
    """
    teams = next_row.Teams
    # Missing values in a pandas column show up as None/NaN, which cannot be
    # scanned as text; treat them as "no teams" instead of raising.
    if not isinstance(teams, str):
        return []
    # The prefix pattern can match the empty string, so match() never
    # returns None here.
    alnum_prefix = _TEAM_PREFIX_RE.match(teams).group()
    # The three alternatives cover every possible first character of the
    # prefix, so findall() tiles it completely without gaps.
    return _TEAM_NAME_RE.findall(alnum_prefix)
# Setup: one row per region; 'Teams' holds every team name run together
# in a single string.
df = pd.DataFrame(
    {
        'Population': [1400, 100, 1000],
        'Teams': ['YankeesMetsBrooklyn[]', 'Rays', 'GiantsAthletics49ers'],
        'Ad Market': [10400, 2000, 8400],
    },
    index=['New York Area', 'Tampa Bay', 'SF Bay Area'],
)

# Create the per-row list of teams, then explode it to one row per team.
df['Team List'] = df.apply(splitDfCap, axis='columns')
# The chain is wrapped in parentheses so it can span several lines; a bare
# line starting with ".explode(...)" is a syntax error.
df = (
    df.rename_axis(index='Region')
    .explode('Team List')
    .rename(columns={'Team List': 'Team'})
    .drop(columns=['Teams'])
)
print(df)
Output is
Population Ad Market Team
Region
New York Area 1400 10400 Yankees
New York Area 1400 10400 Mets
New York Area 1400 10400 Brooklyn
Tampa Bay 100 2000 Rays
SF Bay Area 1000 8400 Giants
SF Bay Area 1000 8400 Athletics
SF Bay Area 1000 8400 49ers
import pandas as pd
df = pd.DataFrame(
    {
        'Population': [1400, 100, 1000],
        'Teams': ['YankeesMetsBrooklyn[]', 'Rays', 'GiantsAthletics49ers'],
        'Ad Market': [10400, 2000, 8400],
    },
    index=['New York Area', 'Tampa Bay', 'SF Bay Area'],
)

# A team name starts with one uppercase letter or a run of digits and
# continues over word characters that are neither uppercase nor digits.
# The raw string keeps the \d and \W escapes intact; without the
# backslashes the pattern matches the literal letters "d" and "W" and
# fails to split "49ers" or to drop the trailing "[]".
df['Teams'] = df['Teams'].str.findall(r'(?:[A-Z]|\d+)[^A-Z\d\W]*')

# One row per team. Columns are renamed by label rather than by position so
# that 'Ad Market' and 'Team' cannot be swapped, then put in display order.
df = (
    df.explode('Teams')
    .rename_axis(index='Region')
    .reset_index()
    .rename(columns={'Teams': 'Team'})
)
df = df[['Region', 'Population', 'Ad Market', 'Team']]