Pandas条件填充失败


import pandas as pd
import numpy as np
sample_data = [
{'Date': '13-12-2020', 'usable': 1, 'infected': 'Case3'},
{'Date': '14-12-2020', 'usable': 1},
{'Date': '15-12-2020', 'usable': 0},
{'Date': '16-12-2020', 'usable': 1, 'infected': 'Case33'},
{'Date': '17-12-2020', 'usable': 1},
{'Date': '18-12-2020', 'usable': 1},  
{'Date': '19-12-2020', 'usable': 0},
{'Date': '20-12-2020', 'usable': 0},
{'Date': '21-12-2020', 'usable': 0, 'infected': 'Case#'},
{'Date': '22-12-2020', 'usable': 1},
{'Date': '23-12-2020', 'usable': 1},
{'Date': '24-12-2020', 'usable': 0},
{'Date': '25-12-2020', 'usable': 0},
{'Date': '26-12-2020', 'usable': 1, 'infected': 'Case46'},
{'Date': '27-12-2020', 'usable': 0},
{'Date': '28-12-2020', 'usable': 1},
]
df = pd.DataFrame(sample_data)
df['infected'] = df['infected'].ffill(limit=2).bfill(limit=2)
df['infected'] = np.where(df['usable']==0, np.NaN, df['infected'])

以上是我的数据框架和我是如何做填充。希望在以下条件下,向前和向后填充最多2个最近的位置:

  1. 仅在可用时填充受感染值=1
  2. 填充到每边最多2个最近的入口(同样仅当可用= 1时)
  3. 希望填充达到零时停止

但是,下面的代码段得到错误的输出:

df['infected'] = df['infected'].ffill(limit=2).bfill(limit=2)
df['infected'] = np.where(df['usable']==0, np.NaN, df['infected'])

预期输出:

expected = [
{'Date': '13-12-2020', 'usable': 1, 'infected': 'Case3'},
{'Date': '14-12-2020', 'usable': 1, 'infected': 'Case3'},
{'Date': '15-12-2020', 'usable': 0},
{'Date': '16-12-2020', 'usable': 1, 'infected': 'Case33'},
{'Date': '17-12-2020', 'usable': 1, 'infected': 'Case33'},
{'Date': '18-12-2020', 'usable': 1, 'infected': 'Case33'},  
{'Date': '19-12-2020', 'usable': 0},
{'Date': '20-12-2020', 'usable': 0},
{'Date': '21-12-2020', 'usable': 0, 'infected': 'Case#'},
{'Date': '22-12-2020', 'usable': 1},
{'Date': '23-12-2020', 'usable': 1},
{'Date': '24-12-2020', 'usable': 0},
{'Date': '25-12-2020', 'usable': 0},
{'Date': '26-12-2020', 'usable': 1, 'infected': 'Case46'},
{'Date': '27-12-2020', 'usable': 0},
{'Date': '28-12-2020', 'usable': 1},
]
df_expected = pd.DataFrame(expected)

'28-12-2020'没有被感染为'Case46',因为27-12-2020有可用= 0,所以无法转移(ffill)

使用bfill和ffill的解决方案

import pandas as pd
import numpy as np
sample_data = [
{'Date': '13-12-2020', 'usable': 1, 'infected': 'Case3'},
{'Date': '14-12-2020', 'usable': 1},
{'Date': '15-12-2020', 'usable': 0},
{'Date': '16-12-2020', 'usable': 1, 'infected': 'Case33'},
{'Date': '17-12-2020', 'usable': 1},
{'Date': '18-12-2020', 'usable': 1},  
{'Date': '19-12-2020', 'usable': 0},
{'Date': '20-12-2020', 'usable': 0},
{'Date': '21-12-2020', 'usable': 0, 'infected': 'Case#'},
{'Date': '22-12-2020', 'usable': 1},
{'Date': '23-12-2020', 'usable': 1},
{'Date': '24-12-2020', 'usable': 0},
{'Date': '25-12-2020', 'usable': 0},
{'Date': '26-12-2020', 'usable': 1, 'infected': 'Case46'},
{'Date': '27-12-2020', 'usable': 0},
{'Date': '28-12-2020', 'usable': 1},
]
df = pd.DataFrame(sample_data)
# Make a infected1 where we keep only the ones that we want to extend
df['infected1'] =  np.where(df['usable']==0, np.NaN, df['infected'])
# Make a infected2 with the extension of column1
df['infected2'] = df['infected1'].ffill(limit=1).bfill(limit=1)
# Remove the ones that should not have been extended (usable is 0)
df['infected3'] =  np.where(df['usable']==0, np.NaN, df['infected'])
# Make a infected3 with the extension of column1 (this is the extension up to the second)
df['infected4'] = df['infected3'].ffill(limit=1).bfill(limit=1)
# Remove the ones that should not have been extended (usable is 0)
df['infected5'] =  np.where(df['usable']==0, np.NaN, df['infected4'])
# Merge the original column with the result (without removing the infected )
df['infected'] = np.where(df['infected4'].isna(),df['infected'],df['infected5'])
# Print just for understanding of code
print(df)
# All above could be made in a for loop if you need more than 2 distance
# Cleanup the not needed columns
df = df.drop(['infected'+str(x) for x in range(1,6)], axis=1)
print(df)

最后一次打印输出为:

Date  usable infected
0   13-12-2020       1    Case3
1   14-12-2020       1    Case3
2   15-12-2020       0      NaN
3   16-12-2020       1   Case33
4   17-12-2020       1   Case33
5   18-12-2020       1      NaN
6   19-12-2020       0      NaN
7   20-12-2020       0      NaN
8   21-12-2020       0    Case#
9   22-12-2020       1      NaN
10  23-12-2020       1      NaN
11  24-12-2020       0      NaN
12  25-12-2020       0      NaN
13  26-12-2020       1   Case46
14  27-12-2020       0      NaN
15  28-12-2020       1      NaN

您可以通过使用适当的掩码(解释在注释中)来做到这一点:

# Make sure you select those rows whose previous row's usable is not 0 and it itself is usable==1 (satisfies your third condition)
m1 = (df["usable"].shift().ne(0)) & (df["usable"].eq(1))
# Also select the rows which are usable and have a valid infected value (the fill values)
m2 = (df["usable"].eq(1)) & (df["infected"].notna())
# Then select with m1 | m2 to ffill and then concat with remaining rows
out = pd.concat([df[~m1 & ~m2], df[m1 | m2].ffill(limit=2)]).sort_values(
"Date", ignore_index=True
)
print (out)
Date  usable infected
0   13-12-2020       1    Case3
1   14-12-2020       1    Case3
2   15-12-2020       0      NaN
3   16-12-2020       1   Case33
4   17-12-2020       1   Case33
5   18-12-2020       1   Case33
6   19-12-2020       0      NaN
7   20-12-2020       0      NaN
8   21-12-2020       0    Case#
9   22-12-2020       1      NaN
10  23-12-2020       1      NaN
11  24-12-2020       0      NaN
12  25-12-2020       0      NaN
13  26-12-2020       1   Case46
14  27-12-2020       0      NaN
15  28-12-2020       1      NaN

最新更新