r语言 - 特定于组的变量,用于跟踪满足条件时的最后一个观测值



我正在尝试构造一个特定于组的变量,即:

  1. NA 表示在满足条件之前进行的观测。
  2. 满足条件时的第一个观察值为 NA。
  3. 对于后续观测值,为满足条件时最后一个先前观测值的值。

下面是一个MWE。我正在寻找构造"last_value"的代码。我更喜欢使用 data.table(或 dplyr(来实现可扩展性/性能,但对其他建议持开放态度。谢谢!

library(data.table)
data.table(
group = c(rep("alpha", 4), rep("beta", 3)),
time = c(1:4, 1:3),
condition = c("Yes", "No", "Yes", "No", "No", "Yes", "No"),
value = 1:7,
last_value = c(NA, 1, 1, 3, NA, NA, 6))
#    group time condition value last_value
# 1: alpha    1       Yes     1         NA
# 2: alpha    2        No     2          1
# 3: alpha    3       Yes     3          1
# 4: alpha    4        No     4          3
# 5:  beta    1        No     5         NA
# 6:  beta    2       Yes     6         NA
# 7:  beta    3        No     7          6
# last_value is:
#     NA in the 1st row as that is the 1st observation for group "alpha"
#     1 in the 2nd row as the 1st observation is condition = "Yes"
#     1 in the 3rd row as the 1st observation is condition = "Yes"; 2nd observation is condition = "No"
#     3 in the 4rd row as the 3rd observation is condition = "Yes"
#     NA in the 5th row as that is the 1st observation for group "beta"
#     NA in the 6th row as there is no prior observation with condition = "Yes"
#     6 in the 7th row as the 6th observation is condition = "Yes"

这是dplyr中使用cummax的另一个。

library(dplyr)
df %>%
group_by(group) %>%
mutate(last = cummax(row_number() * (condition == "Yes")),
last = lag(value[replace(last, last == 0, NA)]))

#  group  time condition value last_value  last
#  <fct> <int> <fct>     <int>      <dbl> <int>
#1 alpha     1 Yes           1         NA    NA
#2 alpha     2 No            2          1     1
#3 alpha     3 Yes           3          1     1
#4 alpha     4 No            4          3     3
#5 beta      1 No            5         NA    NA
#6 beta      2 Yes           6         NA    NA
#7 beta      3 No            7          6     6

借此机会玩了一下data.table的一些最新添加fifelse()nafill()

DT[order(time), # necessary if data is note in order
last_value2 := nafill(shift(fifelse(condition == "Yes", value, NA_integer_)), "locf"),
by = group
]
group time condition value last_value last_value2
1: alpha    1       Yes     1         NA          NA
2: alpha    2        No     2          1           1
3: alpha    3       Yes     3          1           1
4: alpha    4        No     4          3           3
5:  beta    1        No     5         NA          NA
6:  beta    2       Yes     6         NA          NA
7:  beta    3        No     7          6           6

或带管道:

DT[, 
last_value2 := 
fifelse(condition == "Yes", value, NA_integer_) %>% 
shift() %>% 
nafill("locf"),
by = group
]

以下是使用data.table的一些选项:

1.使用非等值联接

DT[, lv := 
DT[condition=="Yes"][.SD, on=.(group, time<time), x.value, by=.EACHI, mult="last"]$x.value
]

2.将序列切成以 TRUE 开头的组

DT2[, lv := if(condition[1L]) value[1L], .(group, cumsum(condition))][,
lv := shift(lv), group]

3.在数据表中滚动连接(应该是最快的(

DT3[, c("lv", "t2") := .(NA_integer_, shift(time))]
DT3[group!=shift(group), t2 := NA_integer_]
DT3[, lv := DT3[(condition)][.SD, on=.(group, time=t2), roll=Inf, x.value]]

4.data.table的开发版本中,你应该能够使用DT[, lv2 := shift(nafill(replace(value, condition=="No", NA_integer_)), group]。警告:由于无法升级,因此无法测试。

数据:

library(data.table)
DT <- data.table(
group = c(rep("alpha", 4), rep("beta", 3)),
time = c(1:4, 1:3),
condition = c(T, F, T, F, F, T, F),
value = 1:7,
last_value = c(NA, 1, 1, 3, NA, NA, 6))
DT2 <- copy(DT); DT3 <- copy(DT);

输出:

group time condition value last_value lv
1: alpha    1      TRUE     1         NA NA
2: alpha    2     FALSE     2          1  1
3: alpha    3      TRUE     3          1  1
4: alpha    4     FALSE     4          3  3
5:  beta    1     FALSE     5         NA NA
6:  beta    2      TRUE     6         NA NA
7:  beta    3     FALSE     7          6  6

计时代码:

library(data.table)
set.seed(0L)
ng <- 1e6
nr <- 1e7
DT <- data.table(group=sort(sample(ng, nr, TRUE)))
DT[, c("time", "condition", "value") := .(rowid(group), sample(c(TRUE, FALSE), nr, TRUE), .I)]
DT2 <- copy(DT)
DT3 <- copy(DT)
mtd0 <- function() {
DT[, lv :=
DT[(condition)][.SD, on=.(group, time<time), x.value, by=.EACHI, mult="last"]$x.value
]
}
mtd1 <- function() {
DT2[, lv := if(condition[1L]) value[1L], .(group, cumsum(condition))][,
lv := shift(lv), group]
}
mtd2 <- function() {
DT3[, c("lv", "t2") := .(NA_integer_, shift(time))]
DT3[group!=shift(group), t2 := NA_integer_]
DT3[, lv := DT3[(condition)][.SD, on=.(group, time=t2), roll=Inf, x.value]]
}
bench::mark(mtd0(), mtd1(), mtd2(), check=FALSE)

计时:

# A tibble: 3 x 13
expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result     memory   time  gc     
<bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>     <list>   <lis> <list> 
1 mtd0()       5.24s   5.24s    0.191      485MB    0.191     1     1      5.24s <df[,5] [~ <df[,3]~ <bch~ <tibbl~
2 mtd1()      48.55s  48.55s    0.0206     240MB    9.19      1   446     48.55s <df[,5] [~ <df[,3]~ <bch~ <tibbl~
3 mtd2()       1.28s   1.28s    0.780      618MB    0         1     0      1.28s <df[,6] [~ <df[,3]~ <bch~ <tibbl~

这是一种dplyr的方法,其中所需的last_value输出在last_value2中忠实地生成。

library(dplyr)
library(tidyr)
df %>%
group_by(group) %>%
mutate(value2 = if_else(condition == "Yes", value, NA_integer_)) %>%
tidyr::fill(value2) %>%
mutate(last_value2 = lag(value2)) %>%
ungroup()
## A tibble: 7 x 7
#  group  time condition value last_value value2 last_value2
#  <fct> <int> <fct>     <int>      <dbl>  <int>       <int>
#1 alpha     1 Yes           1         NA      1          NA
#2 alpha     2 No            2          1      1           1
#3 alpha     3 Yes           3          1      3           1
#4 alpha     4 No            4          3      3           3
#5 beta      1 No            5         NA     NA          NA
#6 beta      2 Yes           6         NA      6          NA
#7 beta      3 No            7          6      6           6

假设此处加载的数据:

df <- data.frame(
group = c(rep("alpha", 4), rep("beta", 3)),
time = c(1:4, 1:3),
condition = c("Yes", "No", "Yes", "No", "No", "Yes", "No"),
value = 1:7,
last_value = c(NA, 1, 1, 3, NA, NA, 6))

最新更新