r语言 - 当一个列改变值时,将数据帧子集



我有一个曲棍球比赛数据集。我的目标是创建一个子集,在这个子集中,球员不会把冰球翻转过来。

数据集包含一个标记为"营业额"的二进制列;如果比赛导致失误,则为1;如果比赛成功,则为0。我的目标是为营业额列中连续为0的每个行子集创建一个唯一的id。

# Example data:
set.seed(33)
x <- runif(20)
y <- runif(20)
df <-as.data.frame(cbind(x,y)) 
df$turnover = ifelse(x<0.5,1,0)
#desired output:
play_id <- c(1,2,3,4,4,4,5,6,7,8,9,10,11,12,13,13,13,14,15,16)
df$play_id <- play_id
df

虽然有点长,但只有dplyr。(虽然data.table::rleid()也是我最喜欢的功能)

df %>% group_by(turnover, play_id = cumsum(turnover)) %>%
mutate(play_id = ifelse(turnover == 0 & as.numeric(row_number()) == 1, 1, turnover)) %>%
ungroup() %>% mutate(play_id = cumsum(play_id))
# A tibble: 20 x 4
x      y turnover play_id
<dbl>  <dbl>    <dbl>   <dbl>
1 0.446  0.566         1       1
2 0.395  0.0427        1       2
3 0.484  0.488         1       3
4 0.919  0.351         0       4
5 0.844  0.970         0       4
6 0.517  0.790         0       4
7 0.437  0.597         1       5
8 0.343  0.357         1       6
9 0.0155 0.759         1       7
10 0.118  0.261         1       8
11 0.691  0.494         0       9
12 0.260  0.793         1      10
13 0.225  0.443         1      11
14 0.342  0.333         1      12
15 0.782  0.626         0      13
16 0.843  0.136         0      13
17 0.775  0.549         0      13
18 0.387  0.922         1      14
19 0.136  0.276         1      15
20 0.900  0.917         0      16

只有baseR

df$play_id <- cumsum(replace(df$turnover, with(rle(df$turnover != 0), 1+cumsum(lengths)[values]), 1))
x          y turnover play_id
1  0.44594048 0.56645266        1       1
2  0.39465031 0.04273416        1       2
3  0.48372887 0.48831925        1       3
4  0.91887596 0.35122322        0       4
5  0.84388144 0.96966171        0       4
6  0.51734962 0.78957889        0       4
7  0.43712500 0.59663026        1       5
8  0.34319822 0.35667053        1       6
9  0.01551696 0.75870834        1       7
10 0.11799116 0.26105077        1       8
11 0.69098590 0.49398192        0       9
12 0.26048568 0.79289819        1      10
13 0.22505121 0.44299896        1      11
14 0.34238622 0.33326556        1      12
15 0.78188794 0.62576846        0      13
16 0.84324669 0.13619319        0      13
17 0.77474887 0.54868278        0      13
18 0.38719298 0.92183168        1      14
19 0.13576507 0.27603364        1      15
20 0.90035758 0.91741393        0      16

以下是data.table的一种方法:

library(data.table)
setDT(df)[, tmp := if(first(turnover) == 0) 
replace(turnover, 1, 1) else turnover, rleid(turnover)]
df[, result := cumsum(tmp)]
df[, tmp := NULL]
df
#             x          y turnover play_id result
# 1: 0.44594048 0.56645266        1       1      1
# 2: 0.39465031 0.04273416        1       2      2
# 3: 0.48372887 0.48831925        1       3      3
# 4: 0.91887596 0.35122322        0       4      4
# 5: 0.84388144 0.96966171        0       4      4
# 6: 0.51734962 0.78957889        0       4      4
# 7: 0.43712500 0.59663026        1       5      5
# 8: 0.34319822 0.35667053        1       6      6
# 9: 0.01551696 0.75870834        1       7      7
#10: 0.11799116 0.26105077        1       8      8
#11: 0.69098590 0.49398192        0       9      9
#12: 0.26048568 0.79289819        1      10     10
#13: 0.22505121 0.44299896        1      11     11
#14: 0.34238622 0.33326556        1      12     12
#15: 0.78188794 0.62576846        0      13     13
#16: 0.84324669 0.13619319        0      13     13
#17: 0.77474887 0.54868278        0      13     13
#18: 0.38719298 0.92183168        1      14     14
#19: 0.13576507 0.27603364        1      15     15
#20: 0.90035758 0.91741393        0      16     16

对于turnover值为0的组,我们将该组的第一个值改为1,并取其累加和。

最新更新