获取一个值在pandas系列中以前出现的索引



假设我有一个数据框架

>> df = pd.DataFrame({'code': ['A', 'A', 'B', 'B', 'C', 'C', 'C', 'B', 'A', 'C', 'C', 'B' ]})
code
0   A
1   A
2   B
3   B
4   C
5   C
6   C
7   B
8   A
9   C
10  C
11  B

我想创建另一个列,它给出了相同值的下一个连续出现之前的代码项,以及前一个出现的目标索引。例如:

code target prev_idx
0   A [] NaN
1   A [] 0
2   B [] NaN
3   B [] 2
4   C [] NaN
5   C [] 4
6   C [] 5
7   B [C, C, C] 3
8   A [B, B, C, C, C, B] 1
9   C [B, A] 6
10  C [] 9
11  B [A, C, C] 7

最快的方法是什么?

def get_target(row,df=None):
"""
this function is applied to all rows in df
row.name is the index of the current row
row.code is the value in the code column
did row.code value appear ever before the current row?
if so return the segments starting from its last appearance till current row
"""
target = (df.code[:row.name]==row.code)
if target.any():
prev_idx = target[target].index[-1]
return {'target':df.code[prev_idx:row.name].tolist()[1:],
'prev_idx':prev_idx}
elif target.empty:  # if this is the first row
return {'target':[],'prev_idx':-1}
# return the index of the previous row
return {'target':[],'prev_idx':target.shift().index[-1]}
df[['target','prev_idx']] = df.apply(get_target,df=df,axis=1,result_type='expand')
df

,

prev_idx03[' C ', ' C ', ' C ']3[' B ', ' B ', ' C ', ' C ', ' C ', ' B ']['B', 'A'][' A ', ' C ', ' C ']

另寻出路。对于我的实际应用程序(数千行),它比@tozCSS发布的答案要快,可能是因为我们不会每次都搜索整个数据框(target = (df.code[:row.name]==row.code))。作为奖励,即使索引不是从0len(df),它也可以工作。

def get_target(row, data, col):
if not row['first_of_seq']:
return []
else:
return data.iloc[data.index.get_loc(row['prev_idx'] + 1): data.index.get_loc(row.name)][col].tolist()
def get_elements_in_between(data, col):
data = data.copy()
# get the index of the previous occurrence of the item
old_idx_name = data.index.name
data = data.rename_axis('dummy_idx').reset_index()
data['prev_idx'] = data.groupby(col)['dummy_idx'].shift()
# check if the item is the first of consecutive occurrences or not
data['first_of_seq'] = ((data['prev_idx'] - data['dummy_idx']) != -1) & (~data['prev_idx'].isna())
# convert floats to integers (and NaN to <NA>)
data['prev_idx'] = data['prev_idx'].astype('Int64')
# revert to the original indexing
data = data.set_index('dummy_idx')
data = data.rename_axis(old_idx_name)
# get the targets using the target indices
data['target'] = data.apply(get_target, args=(data, col), axis=1)

data.drop(columns=['first_of_seq'], inplace=True)

return data

我们得到:

>> get_elements_in_between(df, 'code')
code    prev_idx    target
0   A   <NA>    []
1   A   0   []
2   B   <NA>    []
3   B   2   []
4   C   <NA>    []
5   C   4   []
6   C   5   []
7   B   3   [C, C, C]
8   A   1   [B, B, C, C, C, B]
9   C   6   [B, A]
10  C   9   []
11  B   7   [A, C, C]

最新更新