r语言 - 基于条件滞后进行突变



我有来自IM聊天的数据,其中每一行都是一个击键。我想自动添加列most_recent_enter(在此处手动添加,以获取解释(,它跟踪keystroke==ENTER的最新行。

在此示例中,存在重叠的消息,因此最后一个点击ENTER的用户不一定是最近的用户。我在这里包括其他专栏,以显示我可用的信息。

x <- data.frame(overall_idx = 1:14,
sender=c("a","a","a","b","b","b","a",
"a","a","a","b","b","a","a"),
keystroke=c("H","I","ENTER","K","I","ENTER",
"W","H","I","C","W","H","H","T"),
ks_idx=c(0,1,2,0,1,2,0,1,2,3,0,1,3,3),
most_recent_enter=c(NA,NA,NA,"a","a","a","b","b",
"b","b","b","b","b","b")
)

有没有办法找到满足条件的最新行?

编辑在我的实际数据中,每个对话都标有实验编号。我将如何为每个实验重置此设置?

x <- data.frame(exp_num=c(rep(1,14),rep(2,14)),
overall_idx = c(1:14,1:14),
sender=c("a","a","a","b","b","b","a",
"a","a","a","b","b","a","a",
"a","a","a","b","b","b","a",
"a","a","a","b","b","a","a"),
keystroke=c("H","I","ENTER","K","I","ENTER",
"W","H","I","C","W","H","H","T",
"H","I","ENTER","K","I","ENTER",
"W","H","I","C","W","H","H","T"),
ks_idx=c(0,1,2,0,1,2,0,1,2,3,0,1,3,3,0,1,2,0,1,2,0,1,2,3,0,1,3,3),
most_recent_enter=c(NA,NA,NA,"a","a","a","b","b",
"b","b","b","b","b","b",
NA,NA,NA,"a","a","a","b","b",
"b","b","b","b","b","b")
)

编辑2:当第二个exp_num中有两个不同的发件人时,所选答案实际上失败。例如:

x <- data.frame(exp_num=c(rep(1,14),rep(2,14)),
overall_idx = c(1:14,1:14),
sender=c("a","a","a","b","b","b","a",
"a","a","a","b","b","a","a",
"c","c","c","d","d","d","c",
"c","c","c","d","d","c","c"),
keystroke=c("H","I","ENTER","K","I","ENTER",
"W","H","I","C","W","H","H","T",
"H","I","ENTER","K","I","ENTER",
"W","H","I","C","W","H","H","T"),
ks_idx=c(0,1,2,0,1,2,0,1,2,3,0,1,3,3,0,1,2,0,1,2,0,1,2,3,0,1,3,3),
most_recent_enter=c(NA,NA,NA,"a","a","a","b","b",
"b","b","b","b","b","b",
NA,NA,NA,"c","c","c","d","d",
"d","d","d","d","d","d")

为 Exp 1 和 Exp 2 中的most_recent_new_enter生成相同的as 和bs,而不是cs 和ds

我们可以根据"ENTER"值创建一个分组列,以创建"sender"的first元素作为most_recent,然后在ungroup之后lag该列

library(dplyr)
x %>%
group_by(grp = cumsum(keystroke == 'ENTER')) %>% 
mutate(most_recent_new_enter = case_when(grp > 0 ~ first(sender))) %>% 
ungroup %>% 
mutate(most_recent_new_enter = lag(most_recent_new_enter)) %>%
select(-grp)
# A tibble: 14 x 6
#   overall_idx sender keystroke ks_idx most_recent_enter most_recent_new_enter
#         <int> <fct>  <fct>      <dbl> <fct>             <fct>                
# 1           1 a      H              0 <NA>              <NA>                 
# 2           2 a      I              1 <NA>              <NA>                 
# 3           3 a      ENTER          2 <NA>              <NA>                 
# 4           4 b      K              0 a                 a                    
# 5           5 b      I              1 a                 a                    
# 6           6 b      ENTER          2 a                 a                    
# 7           7 a      W              0 b                 b                    
# 8           8 a      H              1 b                 b                    
# 9           9 a      I              2 b                 b                    
#10          10 a      C              3 b                 b                    
#11          11 b      W              0 b                 b                    
#12          12 b      H              1 b                 b                    
#13          13 a      H              3 b                 b                    
#14          14 a      T              3 b                 b   

对于更新后的帖子,我们可以添加一个group_by

x %>%
group_by(exp_num) %>%  
group_by(grp = cumsum(keystroke == 'ENTER'), .add = TRUE) %>%
mutate(most_recent_new_enter = case_when(grp > 0 ~ first(sender))) %>% 
group_by(exp_num) %>% 
mutate(most_recent_new_enter = lag(most_recent_new_enter)) %>%
select(-grp) %>%
as.data.frame
#exp_num overall_idx sender keystroke ks_idx most_recent_enter most_recent_new_enter
#1        1           1      a         H      0              <NA>                  <NA>
#2        1           2      a         I      1              <NA>                  <NA>
#3        1           3      a     ENTER      2              <NA>                  <NA>
#4        1           4      b         K      0                 a                     a
#5        1           5      b         I      1                 a                     a
#6        1           6      b     ENTER      2                 a                     a
#7        1           7      a         W      0                 b                     b
#8        1           8      a         H      1                 b                     b
#9        1           9      a         I      2                 b                     b
#10       1          10      a         C      3                 b                     b
#11       1          11      b         W      0                 b                     b
#12       1          12      b         H      1                 b                     b
#13       1          13      a         H      3                 b                     b
#14       1          14      a         T      3                 b                     b
#15       2           1      c         H      0              <NA>                  <NA>
#16       2           2      c         I      1              <NA>                  <NA>
#17       2           3      c     ENTER      2              <NA>                  <NA>
#18       2           4      d         K      0                 c                     c
#19       2           5      d         I      1                 c                     c
#20       2           6      d     ENTER      2                 c                     c
#21       2           7      c         W      0                 d                     d
#22       2           8      c         H      1                 d                     d
#23       2           9      c         I      2                 d                     d
#24       2          10      c         C      3                 d                     d
#25       2          11      d         W      0                 d                     d
#26       2          12      d         H      1                 d                     d
#27       2          13      c         H      3                 d                     d
#28       2          14      c         T      3                 d                     d

或者使用tidyrfill

library(tidyr)
x %>% 
mutate(most_recent_new_enter =  lag(case_when(keystroke == 'ENTER' ~ sender))) %>%
fill(most_recent_new_enter)

或使用data.table

library(data.table)
setDT(x)[keystroke == 'ENTER', most_recent_new_enter := sender][, 
most_recent_new_enter :=  shift(zoo::na.locf0(most_recent_new_enter))]

你也可以做:

x %>%
group_by(exp_num) %>% 
mutate(most_recent_enter = sender[
sapply(1:n(),
function(x) max(which(keystroke == 'ENTER')[which(keystroke == 'ENTER') < x])
)] 
)

输出(前几行(:

# A tibble: 28 x 6
# Groups:   exp_num [2]
exp_num overall_idx sender keystroke ks_idx most_recent_enter
<dbl>       <int> <fct>  <fct>      <dbl> <fct>            
1       1           1 a      H              0 NA               
2       1           2 a      I              1 NA               
3       1           3 a      ENTER          2 NA               
4       1           4 b      K              0 a                
5       1           5 b      I              1 a                
6       1           6 b      ENTER          2 a                
7       1           7 a      W              0 b                
8       1           8 a      H              1 b                
9       1           9 a      I              2 b                
10       1          10 a      C              3 b                
# ... with 18 more rows

基本上,您检查每个行号,这是对应于仍低于当前行号ENTER的最大索引,并将其用于子集sender