这是我在这里提的第一个问题,希望我的提问方式是正确的!
我有一个包含数百万条观测的数据集。每一行是某个人在某一日期领取的一次药物处方,同一个人会在数据框中出现多次。
library(dplyr)

# Reproducible toy data: 40 prescription records for 10 individuals.
set.seed(42)
ID <- sample(
  c("ID1", "ID2", "ID3", "ID4", "ID5", "ID6", "ID7", "ID8", "ID9", "ID10"),
  40,
  replace = TRUE
)
# 40 distinct first-of-month dates, drawn without replacement.
prescription_date <- sample(
  seq(as.Date("1999/01/01"), as.Date("2010/01/01"), by = "month"),
  40
)
# 1 = medication changed relative to the previous prescription, 0 = unchanged.
switch <- sample(c(0, 1), 40, replace = TRUE, prob = c(0.4, 0.6))
df <- data.frame(ID, prescription_date, switch) %>%
  group_by(ID)

# Show all 40 rows ordered by individual (dates are still unsorted here).
df %>%
  arrange(ID) %>%
  print(n = 40)
#> # A tibble: 40 x 3
#> # Groups: ID [10]
#> ID prescription_date switch
#> <fct> <date> <dbl>
#> 1 ID1 2007-03-01 1
#> 2 ID1 1999-06-01 0
#> 3 ID1 1999-02-01 1
#> 4 ID1 2006-09-01 0
#> 5 ID10 2008-08-01 0
#> 6 ID10 2000-09-01 1
#> 7 ID10 2001-09-01 1
#> 8 ID10 2001-11-01 1
#> 9 ID10 2000-04-01 1
#> 10 ID10 2004-09-01 1
#> 11 ID2 2008-10-01 1
#> 12 ID2 2003-01-01 0
#> 13 ID2 2005-12-01 0
#> 14 ID2 2000-06-01 0
#> 15 ID3 2007-07-01 1
#> 16 ID3 2007-11-01 0
#> 17 ID4 1999-03-01 1
#> 18 ID4 2003-10-01 0
#> 19 ID4 1999-05-01 1
#> 20 ID4 2007-10-01 1
#> 21 ID4 2005-04-01 0
#> 22 ID4 2009-05-01 1
#> 23 ID4 2005-10-01 0
#> 24 ID4 2003-07-01 0
#> 25 ID5 2008-06-01 1
#> 26 ID5 2002-04-01 1
#> 27 ID5 2005-01-01 0
#> 28 ID5 2001-05-01 0
#> 29 ID5 2009-09-01 1
#> 30 ID6 2006-08-01 0
#> 31 ID6 2000-12-01 0
#> 32 ID7 2007-06-01 0
#> 33 ID8 2008-11-01 1
#> 34 ID8 1999-09-01 0
#> 35 ID8 2007-05-01 0
#> 36 ID8 2009-03-01 1
#> 37 ID9 2009-10-01 0
#> 38 ID9 1999-10-01 1
#> 39 ID9 2007-04-01 0
#> 40 ID9 2008-01-01 0
由 reprex 包 (v0.3.0) 创建于 2021-06-19
变量 "switch" 表示该次处方中的药物相对于上一次处方是否发生了更换。我需要知道每个人第三次换药的日期。然而我遇到了困难,因为我似乎无法在每条观测上为变量 "switch" 创建逐行累加的和。只要能得到类似下面的结果就足够了:
#> # A tibble: 40 x 4
#> # Groups: ID [10]
#> ID prescription_date switch date3switch
#> <fct> <date> <dbl> <dbl>
#> 1 ID1 1999-02-01 1 1
#> 2 ID1 1999-06-01 0 NA
#> 3 ID1 2006-09-01 0 NA
#> 4 ID1 2007-03-01 1 2
#> 5 ID10 2000-04-01 1 1
#> 6 ID10 2000-09-01 1 2
#> 7 ID10 2001-09-01 1 3
#> 8 ID10 2001-11-01 1 4
#> 9 ID10 2004-09-01 1 5
#> 10 ID10 2008-08-01 0 NA
#> 11 ID2 2000-06-01 0 NA
#> 12 ID2 2003-01-01 0 NA
#> 13 ID2 2005-12-01 0 NA
#> 14 ID2 2008-10-01 1 1
#> 15 ID3 2007-07-01 1 1
#> 16 ID3 2007-11-01 0 NA
#> 17 ID4 1999-03-01 1 1
#> 18 ID4 1999-05-01 1 2
#> 19 ID4 2003-07-01 0 NA
#> 20 ID4 2003-10-01 0 NA
#> 21 ID4 2005-04-01 0 NA
#> 22 ID4 2005-10-01 0 NA
#> 23 ID4 2007-10-01 1 3
#> 24 ID4 2009-05-01 1 4
我试着写了一个 for 循环,但这对我这个初学者来说可能太难了,因为我只得到了一个 NULL 数据框。
# Asker's non-working attempt, kept verbatim to illustrate the problem.
# NOTE: a `for` loop evaluates to NULL, so this assignment replaces `df` with NULL.
df <- for (i in 1:dim(df)[1]) {
# `data` is not defined in this snippet (the data frame is called `df`), and the
# loop index `i` is never used, so the condition cannot vary between iterations.
if(sum(data$switch) == 3)
# mutate() is called without a data frame, uses `==` (comparison) rather than
# `=`, and refers to `prescribed_date` while the column is `prescription_date`.
{ mutate(date3switch == prescribed_date)}
else NA
}
由 reprex 包 (v0.3.0) 创建于 2021-06-19
感谢你的帮助!
library(dplyr)

# Rebuild the example data from the question (same seed, same draws).
set.seed(42)
ID <- sample(
  c("ID1", "ID2", "ID3", "ID4", "ID5", "ID6", "ID7", "ID8", "ID9", "ID10"),
  40,
  replace = TRUE
)
prescription_date <- sample(
  seq(as.Date("1999/01/01"), as.Date("2010/01/01"), by = "month"),
  40
)
switch <- sample(c(0, 1), 40, replace = TRUE, prob = c(0.4, 0.6))
df <- data.frame(ID, prescription_date, switch) %>%
  group_by(ID)

# df is already grouped by ID, so no second group_by() is needed.
df %>%
  # Order each individual's prescriptions chronologically before counting.
  arrange(prescription_date, .by_group = TRUE) %>%
  # Running switch count per ID; rows without a switch are set to NA.
  mutate(switch2 = ifelse(switch == 0, NA, cumsum(switch))) %>%
  print(n = 40)
#> # A tibble: 40 x 4
#> # Groups: ID [10]
#> ID prescription_date switch switch2
#> <chr> <date> <dbl> <dbl>
#> 1 ID1 1999-02-01 1 1
#> 2 ID1 1999-06-01 0 NA
#> 3 ID1 2006-09-01 0 NA
#> 4 ID1 2007-03-01 1 2
#> 5 ID10 2000-04-01 1 1
#> 6 ID10 2000-09-01 1 2
#> 7 ID10 2001-09-01 1 3
#> 8 ID10 2001-11-01 1 4
#> 9 ID10 2004-09-01 1 5
#> 10 ID10 2008-08-01 0 NA
#> 11 ID2 2000-06-01 0 NA
#> 12 ID2 2003-01-01 0 NA
#> 13 ID2 2005-12-01 0 NA
#> 14 ID2 2008-10-01 1 1
#> 15 ID3 2007-07-01 1 1
#> 16 ID3 2007-11-01 0 NA
#> 17 ID4 1999-03-01 1 1
#> 18 ID4 1999-05-01 1 2
#> 19 ID4 2003-07-01 0 NA
#> 20 ID4 2003-10-01 0 NA
#> 21 ID4 2005-04-01 0 NA
#> 22 ID4 2005-10-01 0 NA
#> 23 ID4 2007-10-01 1 3
#> 24 ID4 2009-05-01 1 4
#> 25 ID5 2001-05-01 0 NA
#> 26 ID5 2002-04-01 1 1
#> 27 ID5 2005-01-01 0 NA
#> 28 ID5 2008-06-01 1 2
#> 29 ID5 2009-09-01 1 3
#> 30 ID6 2000-12-01 0 NA
#> 31 ID6 2006-08-01 0 NA
#> 32 ID7 2007-06-01 0 NA
#> 33 ID8 1999-09-01 0 NA
#> 34 ID8 2007-05-01 0 NA
#> 35 ID8 2008-11-01 1 1
#> 36 ID8 2009-03-01 1 2
#> 37 ID9 1999-10-01 1 1
#> 38 ID9 2007-04-01 0 NA
#> 39 ID9 2008-01-01 0 NA
#> 40 ID9 2009-10-01 0 NA
由 reprex 包 (v2.0.0) 创建于 2021-06-19
使用 `cumsum` 会很有帮助,然后把 `switch == 0` 的行的值替换为 `NA`。
library(dplyr)

# Running count of medication switches per individual.
# Rows must be sorted by prescription_date within ID: cumsum() counts rows in
# their current order, so unsorted dates would number the switches wrongly.
df %>%
  arrange(ID, prescription_date) %>%
  group_by(ID) %>%
  mutate(
    date3switch = cumsum(switch),
    # Rows without a switch get NA instead of repeating the previous count.
    date3switch = replace(date3switch, switch == 0, NA)
  ) %>%
  ungroup()
# ID prescription_date switch date3switch
# <chr> <date> <dbl> <dbl>
# 1 ID1 1999-02-01 1 1
# 2 ID1 1999-06-01 0 NA
# 3 ID1 2006-09-01 0 NA
# 4 ID1 2007-03-01 1 2
# 5 ID10 2000-04-01 1 1
# 6 ID10 2000-09-01 1 2
# 7 ID10 2001-09-01 1 3
# 8 ID10 2001-11-01 1 4
# 9 ID10 2004-09-01 1 5
#10 ID10 2008-08-01 0 NA
# … with 30 more rows
我们也可以使用 `na_if`:
library(dplyr)

# Running count of medication switches per individual, with NA on every row
# where no switch happened.
df %>%
  # Chronological order within each ID so the count follows the timeline.
  arrange(ID, prescription_date) %>%
  group_by(ID) %>%
  # switch is coded 0/1, so cumsum(switch) * switch is 0 exactly on the rows
  # where switch == 0. A bare na_if(cumsum(switch), 0) would only blank the
  # rows before the first switch and leave later non-switch rows with a count.
  mutate(date3switch = na_if(cumsum(switch) * switch, 0)) %>%
  ungroup()