In a dataframe df containing the columns person_id (int), dates (datetime) and is_hosp (boolean), I need to find out, for every date of every person_id, whether is_hosp is True within the 15 days after the row's date.
Edit:
Following a relevant comment on the original question, I should specify that the observation dates per row/person_id are not always consecutive (i.e. a person_id can have an observation on 2018-01-01 and the next one on 2018-08-01).
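For illustration, here is a tiny hypothetical example of the flag I am after (the column names match my real frame, the values do not):
import pandas as pd

toy = pd.DataFrame({
    'person_id': [1, 1, 1],
    'dates': pd.to_datetime(['2018-01-01', '2018-01-10', '2018-08-01']),
    'is_hosp': [False, True, False],
})
# expected is_inh_15d: [True, True, False]
# 2018-01-01 -> True  (is_hosp is True on 2018-01-10, within the next 15 days)
# 2018-01-10 -> True  (is_hosp is True on that date itself)
# 2018-08-01 -> False (no True within the following 15 days)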
I have written the code below, but it is slow.
To reproduce the problem, here is a fake dataframe with the same format as the one I am working with:
import pandas as pd
import datetime as dt
import numpy as np
count_unique_person = 10
dates = pd.date_range(dt.datetime(2019, 1, 1), dt.datetime(2021, 1, 1))
dates = dates.to_pydatetime().tolist()
person_id = [i for i in range(1, count_unique_person + 1) for _ in range(len(dates))]
dates = dates * count_unique_person
sample_arr = [True, False]
col_to_check = np.random.choice(sample_arr, size = len(dates))
df = pd.DataFrame()
df['person_id'] = person_id
df['dates'] = dates
df['is_hosp'] = col_to_check
Here is how I implemented the check (column is_inh_15d), but it takes far too long to run (my real dataframe contains about a million rows):
is_inh = []
is_inh_idx = []
for each_p in np.unique(df['person_id'].values):
    t_df = df[df['person_id'] == each_p].copy()
    for index, e_row in t_df.iterrows():
        s_dt = e_row['dates']
        e_dt = s_dt + dt.timedelta(days = 15)
        # keep this person's rows falling in [row date, row date + 15 days]
        t_df2 = t_df[(t_df['dates'] >= s_dt) & (t_df['dates'] <= e_dt)]
        is_inh.append(np.any(t_df2['is_hosp'] == True))
        is_inh_idx.append(index)
h_15d_df = pd.DataFrame()
h_15d_df['is_inh_15d'] = is_inh
h_15d_df['is_inh_idx'] = is_inh_idx
h_15d_df.set_index('is_inh_idx', inplace = True)
df = df.join(h_15d_df)
I don't know how to vectorise the per-row logic of checking the next 15 days to see whether is_hosp is True.
Can anyone point me in the right direction?
.rolling() accepts a window such as "16D" on time-series data or a datetime-like index, i.e. a window of 16 days. (16 days means the row's own day plus the 15 days that follow it.)
is_inh_15d = (
    df.sort_values('dates', ascending=False)
      .groupby('person_id')
      .rolling('16D', on='dates')
      .is_hosp.max() == 1
)
df2 = df.join(is_inh_15d, on=['person_id', 'dates'], rsuffix='_15d')
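Sorting the dates in descending order is what turns pandas' backward-looking rolling window into a forward-looking one. As a small follow-up (a sketch, assuming the joined column comes out named is_hosp_15d because of the rsuffix), the result can be renamed and cast to match the column name used in the question:
# hypothetical tidy-up of the joined result; 'is_hosp_15d' is the assumed column name
df2 = df2.rename(columns={'is_hosp_15d': 'is_inh_15d'})
df2['is_inh_15d'] = df2['is_inh_15d'].astype(bool)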
I think I found something. I modified the data generation at the start so that there are more False values: sample_arr = [True, False] becomes sample_arr = [True] + [False] * 10. With the previous line it is really hard to get 15 consecutive days of False.
The solution I found is:
to_concat = []
for i in range(15):
    to_concat.append(
        df.groupby('person_id')
        # the resample gets the 15 following days, and I use the offset to move
        # the window forward
        .resample('15D', offset=pd.to_timedelta(i, 'D'), on='dates')
        # max tells us whether there is a True (counted as 1) in the window
        .max()
        # the offset produces some dates outside the initial date range;
        # I remove them with the following line
        .drop(['dates', 'person_id'], axis='columns')[
            lambda x: (x.index.get_level_values('dates') >= df['dates'].min())
            & (x.index.get_level_values('dates') <= df['dates'].max())
        ]
    )
# I reset the index in order to have person_id and dates as columns,
# which looks more like your initial dataframe
df = pd.concat(to_concat).sort_index().reset_index()
I tried to compare the two approaches by running pd.testing.assert_frame_equal on the two resulting dataframes, and I get a 2% difference in the values of the is_inh_15d column. I don't know exactly where it comes from.
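For reference, a rough sketch of how that mismatch rate could be quantified (op_df and resample_df are hypothetical names for the two result frames, each with the flag column renamed to is_inh_15d):
merged = op_df.merge(resample_df, on=['person_id', 'dates'], suffixes=('_op', '_rs'))
# fraction of rows where the two flags disagree
mismatch = (merged['is_inh_15d_op'] != merged['is_inh_15d_rs']).mean()
print(f"{mismatch:.1%} of rows differ")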
This solution seems to be roughly 8x faster, measured with the time command on Linux:
python ./op_solution.py  6,90s user 0,79s system 113% cpu 6,797 total
python ./other_solution.py  0,86s user 0,76s system 231% cpu 0,699 total
I modified this approach to sort by person and date, and then used the number of days between each person's visits to generate the h_15d_df column. It is a bit hard to verify because of the dataset, but I reduced the number of days to check from 15 down to 1 and it seemed to work.
start = time.time()
ori_func()
end = time.time()
print(end - start)
16.748218297958374
start = time.time()
new_func()
end = time.time()
print(end - start)
0.15230989456176758
It is a bit convoluted, but I measured roughly a 100x improvement:
import pandas as pd
import datetime as dt
import numpy as np
import time
def create_df():
    count_unique_person = 10
    dates = pd.date_range(dt.datetime(2019, 1, 1), dt.datetime(2021, 1, 1))
    dates = dates.to_pydatetime().tolist()
    person_id = [i for i in range(1, count_unique_person + 1) for _ in range(len(dates))]
    dates = dates * count_unique_person
    sample_arr = [True, False]
    col_to_check = np.random.choice(sample_arr, size = len(dates))
    df = pd.DataFrame()
    df['person_id'] = person_id
    df['dates'] = dates
    df['is_hosp'] = col_to_check
    return(df)
def ori_func():
    df = create_df()
    is_inh = []
    is_inh_idx = []
    for each_p in np.unique(df['person_id'].values):
        t_df = df[df['person_id'] == each_p].copy()
        for index, e_row in t_df.iterrows():
            s_dt = e_row['dates']
            e_dt = s_dt + dt.timedelta(days = 15)
            t_df2 = t_df[(t_df['dates'] >= s_dt) & (t_df['dates'] <= e_dt)]
            is_inh.append(np.any(t_df2['is_hosp'] == True))
            is_inh_idx.append(index)
    h_15d_df = pd.DataFrame()
    h_15d_df['is_inh_15d'] = is_inh
    h_15d_df['is_inh_idx'] = is_inh_idx
    h_15d_df.set_index('is_inh_idx', inplace = True)
    df = df.join(h_15d_df)
    return(df)
def ExpandGaps(gaps, d, length):
    expand_gaps = []
    for x in gaps:
        if x <= d:
            expand_gaps += [True]*x
        if x > d:
            expand_gaps += [False]*(x-d)
            expand_gaps += [True]*d
    ## Fill in the remainder with False
    expand_gaps += [False]*(length - len(expand_gaps))
    return(expand_gaps)
def new_func():
    df = create_df()
    is_inh = []
    is_inh_idx = []
    # Sort by ID and dates.
    df = df.sort_values(by=["person_id", "dates"])
    for each_p in np.unique(df['person_id'].values):
        t_df = df[df['person_id'] == each_p].copy()
        # Create list of dates per patient where is_hosp == True
        if t_df.iloc[0]['is_hosp'] == False:
            #true_dates = t_df.iloc[0]['dates'](t_df['dates'][t_df['is_hosp'] == True])
            # Dates where is_hosp is True
            true_dates = t_df['dates'][(t_df['is_hosp'] == True)]
            first_row = pd.Series([t_df['dates'][t_df.index[0]]])
            # Add first row to the start if is_hosp == False
            true_dates = pd.concat([first_row, true_dates])
        else:
            true_dates = t_df['dates'][t_df['is_hosp'] == True]
        ## Calculate the gaps between the visits and convert to number of days
        gaps = np.array([(t - s).days for s, t in zip(true_dates, true_dates[1:])])
        expand_gaps = ExpandGaps(gaps, 15, len(t_df))
        is_inh += expand_gaps
    h_15d_df = pd.DataFrame()
    h_15d_df['is_inh_15d'] = is_inh
    #h_15d_df['is_inh_idx'] = is_inh_idx
    #h_15d_df.set_index('is_inh_idx', inplace = True)
    df = df.join(h_15d_df)
    return(df)
start = time.time()
ori_func()
end = time.time()
print(end - start)
start = time.time()
new_func()
end = time.time()
print(end - start)
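For completeness, a rough way to sanity-check the rewrite against the original on identical data (a sketch; np.random.seed is used here only so that both functions see the same random is_hosp column):
np.random.seed(0)
ori = ori_func()
np.random.seed(0)
new = new_func()
# new_func sorts by person_id/dates, so align on those keys before comparing
merged = ori.merge(new, on=['person_id', 'dates'], suffixes=('_ori', '_new'))
print((merged['is_inh_15d_ori'] == merged['is_inh_15d_new']).mean())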