
  • 本文关键字:var 索引 r date data.table dplyr
  • 更新时间 :
  • 英文 :

我有一个数据集,其中每个 id 都对应一系列日期。我已经生成了一组超前(lead)和滞后(lag)变量,现在想再生成一组变量,用来表示每一行中相邻的超前/滞后日期之间相差的天数。生成超前和滞后变量时,我用 paste0 给每个变量名附加了一个数字,例如滞后变量命名为 prev_date1 到 prev_date20。我希望在生成这组新变量时,能够利用这些数字来计算相邻两个日期之间相差的天数。一般形式如下:diff2prev[i] = prev_date[i-1] - prev_date[i]


if (!require('pacman')) install.packages('pacman'); library(pacman) 
#> Loading required package: pacman
p_load("data.table", "dplyr", "lubridate", "tidyverse")
# Example data: one row per (id, date) observation, dates given as ISO strings.
id <- c(13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15)
date <- c("2017-06-06", "2017-07-26", "2017-09-22", "2017-10-21", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2019-05-12", "2019-06-07", "2019-10-08","2016-10-20", "2016-10-20", "2016-10-20", "2016-10-20", "2018-01-06", "2018-01-06", "2018-01-06", "2018-01-06", "2018-01-06","2018-01-06", "2018-05-02", "2018-08-04", "2018-08-04", "2018-08-04", "2018-11-22", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2019-05-11","2019-06-04", "2019-11-18", "2016-04-01", "2018-04-04", "2019-04-03", "2019-04-04", "2019-04-04", "2019-04-04", "2019-04-04","2019-04-04", "2019-04-04", "2019-04-04", "2019-04-04", "2019-04-04", "2019-04-04", "2019-04-04", "2019-06-03", "2019-06-04", "2019-11-23")

# Build the table as a data.table with named columns and a real Date column.
# The `:=` / shift() calls that follow need a data.table (a grouped tibble
# cannot be modified by reference), and naming the columns here avoids the
# bind_cols() "New names" repair followed by a manual rename.
sample <- data.table(id = id, date = as_date(date))
# Add the 20 previous dates within each id (prev_date1 ... prev_date20)
# using data.table's shift() in "lag" mode; the trailing [] prints the result.
sample[, paste0("prev_date", seq_len(20)) := shift(date, n = seq_len(20), type = "lag"), by = id][]

# Add the 20 following dates within each id (next_date1 ... next_date20)
# using data.table's shift() in "lead" mode; the trailing [] prints the result.
sample[, paste0("next_date", seq_len(20)) := shift(date, n = seq_len(20), type = "lead"), by = id][]


## Dplyr approach to writing out each new variable
## This works but seems inefficient
# The lag/lead columns created above are numbered from 1 (prev_date1, next_date1, ...);
# there is no unnumbered `prev_date` / `next_date` column. The first difference is
# therefore taken against `date` itself, and every later one against the
# neighbouring lag/lead: diff2prev[i] = prev_date[i-1] - prev_date[i].
# The subtraction is row-wise, so no grouping is needed for this step.
sample <- sample %>%
  mutate(
    diff2prev1 = date - prev_date1,
    diff2prev2 = prev_date1 - prev_date2,
    diff2prev3 = prev_date2 - prev_date3,
    diff2prev4 = prev_date3 - prev_date4,
    diff2prev5 = prev_date4 - prev_date5,
    diff2prev6 = prev_date5 - prev_date6,
    diff2prev7 = prev_date6 - prev_date7,
    diff2next1 = next_date1 - date,
    diff2next2 = next_date2 - next_date1,
    diff2next3 = next_date3 - next_date2,
    diff2next4 = next_date4 - next_date3,
    diff2next5 = next_date5 - next_date4,
    diff2next6 = next_date6 - next_date5,
    diff2next7 = next_date7 - next_date6
  )
##Attempt at using data.table to generate variables but not sure how to incorporate the length of [i] for iteration
# NOTE(review): non-working attempt, kept as written to illustrate the question.
# The right-hand side `(expr, 1:20)` is not valid R syntax, no visible line
# defines `i`, and `pid_ell` is not created anywhere in the code shown here
# (the table built above is called `sample`).
setDT(pid_ell)[,paste0('diff2prev', 1:20) := (diff2prev[i] = prev_date[i-1] - prev_date[i], 1:20), by = id][]
##Attempt to create a function that would create the new empty variables and then fill them in
#function to create variable calculating the difference in days to the previous date
# NOTE(review): non-working sketch, kept as written to illustrate the question.
# The function body and both `for` loops are never closed, and `lead_lag`,
# `diff2prev` and `prev_date1` are neither arguments nor defined in the visible code.
fn_diff2prev <- function(date, prev_date) {
for (i in 2:lead_lag){
# Right-hand side holds 20 generated names, but the left-hand side is a single element.
diff2prev[i] <- paste0('diff2prev', 1:20) # new var names
diff2prev1 <- date - prev_date1 #first one calculates from date
# Inner loop reuses the same index variable `i` as the outer loop.
for (i in 2:lead_lag){
# Intended recurrence: each difference is the previous lag minus the current lag.
diff2prev[i] <- prev_date[i-1] - prev_date[i] #others calculate based on [i]


# For each id, compute the gap in days between consecutive dates once, then
# fan it out: date2prev1..20 are that gap shifted back by 0..19 rows, and
# date2next1..20 are the same gap shifted forward by 1..20 rows.
sample[
  ,
  c(paste0("date2prev", 1:20), paste0("date2next", 1:20)) := {
    # The leading NA keeps the vector aligned with the rows (the first row has
    # no previous date).
    gap_days <- c(NA, diff(date))
    c(
      shift(gap_days, n = 0:19, type = "lag"),
      shift(gap_days, n = 1:20, type = "lead")
    )
  },
  by = id
]


id prev_date2 prev_date1       date next_date1 next_date2 date2prev1 date2prev2 date2next1 date2next2
1: 13       <NA>       <NA> 2017-06-06 2017-07-26 2017-09-22         NA         NA         50         58
2: 13       <NA> 2017-06-06 2017-07-26 2017-09-22 2017-10-21         50         NA         58         29
3: 13 2017-06-06 2017-07-26 2017-09-22 2017-10-21 2018-03-29         58         50         29        159
4: 13 2017-07-26 2017-09-22 2017-10-21 2018-03-29 2018-03-29         29         58        159          0
5: 13 2017-09-22 2017-10-21 2018-03-29 2018-03-29 2018-03-29        159         29          0          0
6: 13 2017-10-21 2018-03-29 2018-03-29 2018-03-29 2018-03-29          0        159          0          0
7: 13 2018-03-29 2018-03-29 2018-03-29 2018-03-29 2018-03-29          0          0          0          0
8: 13 2018-03-29 2018-03-29 2018-03-29 2018-03-29 2018-03-29          0          0          0          0
9: 13 2018-03-29 2018-03-29 2018-03-29 2018-03-29 2018-03-29          0          0          0          0
10: 13 2018-03-29 2018-03-29 2018-03-29 2018-03-29 2018-03-29          0          0          0          0
11: 13 2018-03-29 2018-03-29 2018-03-29 2018-03-29 2018-03-29          0          0          0          0
12: 13 2018-03-29 2018-03-29 2018-03-29 2018-03-29 2018-03-29          0          0          0          0
13: 13 2018-03-29 2018-03-29 2018-03-29 2018-03-29 2018-03-29          0          0          0          0
14: 13 2018-03-29 2018-03-29 2018-03-29 2018-03-29 2019-05-12          0          0          0        409
15: 13 2018-03-29 2018-03-29 2018-03-29 2019-05-12 2019-06-07          0          0        409         26
16: 13 2018-03-29 2018-03-29 2019-05-12 2019-06-07 2019-10-08        409          0         26        123
17: 13 2018-03-29 2019-05-12 2019-06-07 2019-10-08       <NA>         26        409        123         NA
18: 13 2019-05-12 2019-06-07 2019-10-08       <NA>       <NA>        123         26         NA         NA
19: 14       <NA>       <NA> 2016-10-20 2016-10-20 2016-10-20         NA         NA          0          0
20: 14       <NA> 2016-10-20 2016-10-20 2016-10-20 2016-10-20          0         NA          0          0


# Number of lagged date columns (prev_date1 ... prev_dateN) to difference.
N <- 20
# Define the columns to create and the x & y columns such that each new
# column is x - y: diff2prev1 = date - prev_date1,
# diff2prev[i] = prev_date[i-1] - prev_date[i] for i > 1.
ncols <- paste0("diff2prev", 1:N)
x_cols <- c("date", paste0("prev_date", 1:(N-1)))
y_cols <- paste0("prev_date", 1:N)
# Loop over the column triples and assign by reference with data.table::set(),
# which avoids the overhead of `[.data.table` on every iteration.
for (i in seq_len(N)) {
  set(
    sample,
    j = ncols[i],
    value = sample[[x_cols[i]]] - sample[[y_cols[i]]]
  )
}


#Create data.table
# `date` is parsed to Date here so the date columns can be subtracted below.
sample <- data.table(id, date = as.Date(date))
#Create next and previous dates
sample[, paste0('prev_date', 1:5) := shift(date, 1:5, type = "lag"), by = id][]
# The next_date columns must look forward, hence type = "lead".
sample[, paste0('next_date', 1:5) := shift(date, 1:5, type = "lead"), by = id][]
#Create vectors of next and previous column names along with "date" column
p1 <- c('date', grep('prev_date', names(sample), value = TRUE))
n1 <- c('date', grep('next_date', names(sample), value = TRUE))
#Create names of the new difference columns (one per lag / lead column)
new_p1 <- paste0('new_prev', seq_along(p1[-1]))
new_n1 <- paste0('new_next', seq_along(n1[-1]))
#Convert to dataframe
# The column-wise subtraction below relies on data.frame semantics for
# `sample[cols]` (select columns by name), so drop the data.table class first.
sample <- as.data.frame(sample)
#Perform subtract of the columns.
# Columns are paired by position: (date, prev_date1..4) minus (prev_date1..5),
# and (next_date1..5) minus (date, next_date1..4).
sample[new_p1] <- sample[p1[-length(p1)]] - sample[p1[-1]]
sample[new_n1] <- sample[n1[-1]] - sample[n1[-length(n1)]]


  • 没有找到相关文章
