r-检查日期间隔是否重叠



我有一个下面的数据帧,其中每个客户端都有不同的程序周期。我正在尝试创建一个循环,检查客户是否在选定的时间段(每月频率(参与了某个项目。

Client <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L") 
EntryDate <- as.Date(c("2004-01-01","2004-02-11", "2004-03-21","2004-04-14","2004-05-22","2004-05-08","2004-07-20",
"2004-07-28","2004-09-30","2004-10-25","2004-11-28","2004-12-30"))
ExitDate <- as.Date(c("2005-01-22","2004-03-01", "2004-05-21","2004-04-28","2004-08-09","2004-08-08","2005-06-20",
"2004-10-28","2004-10-30","2004-11-01","2004-12-28","2005-12-30"))
df <- data.frame(Client,EntryDate, ExitDate)
df$interval_client = interval(df$EntryDate,df$ExitDate)
Client  EntryDate   ExitDate                interval_client
1       A 2004-01-01 2005-01-22 2004-01-01 UTC--2005-01-22 UTC
2       B 2004-02-11 2004-03-01 2004-02-11 UTC--2004-03-01 UTC
3       C 2004-03-21 2004-05-21 2004-03-21 UTC--2004-05-21 UTC
4       D 2004-04-14 2004-04-28 2004-04-14 UTC--2004-04-28 UTC
5       E 2004-05-22 2004-08-09 2004-05-22 UTC--2004-08-09 UTC
6       F 2004-05-08 2004-08-08 2004-05-08 UTC--2004-08-08 UTC
7       G 2004-07-20 2005-06-20 2004-07-20 UTC--2005-06-20 UTC
8       H 2004-07-28 2004-10-28 2004-07-28 UTC--2004-10-28 UTC
9       I 2004-09-30 2004-10-30 2004-09-30 UTC--2004-10-30 UTC
10      J 2004-10-25 2004-11-01 2004-10-25 UTC--2004-11-01 UTC
11      K 2004-11-28 2004-12-28 2004-11-28 UTC--2004-12-28 UTC
12      L 2004-12-30 2005-12-30 2004-12-30 UTC--2005-12-30 UTC

基本上,我想看看某个客户在哪个月(从2004年1月到2005年12月(参加了一个项目。我尝试了不同的代码,但都没有得到所需的结果。

years <- 2004:2005
months <- 1:12
# option 1
for (y in years){
for (m in months){
df[ ,paste0(y, '-', m)] <- ifelse(df$interval_client %within% interval(ymd(paste0(y, '-', m, '-', 1)), ceiling_date(ymd(paste0(y, '-', m, '-', 1)),"month")-1), 1, 0)
}
}
# option 2
for (y in years){
for (m in months){
df[ ,paste0(y, '-', m)] <- ifelse(df$interval_client >= as.Date(paste0(y, '-', m, '-', 1), '%Y-%m-%d') & df$interval_client <= ceiling_date(as.Date(paste0(y, '-', m, '-', 1), '%Y-%m%d'),"month")-1 ,1, 0)
}
}

我想要这样的东西(只显示前3行(

Client  EntryDate   ExitDate                interval_client   2004-1 2004-2 2004-3 2004-4 2004-5 2004-6 2004-7 2004-8 2004-9
1      A 2004-01-01 2005-01-22 2004-01-01 UTC--2005-01-22 UTC      1      1      1      1      1      1      1      1      1
2      B 2004-02-11 2004-03-01 2004-02-11 UTC--2004-03-01 UTC      0      1      1      0      0      0      0      0      0
3      C 2004-03-21 2004-05-21 2004-03-21 UTC--2004-05-21 UTC      0      0      1      1      1      0      0      0      0
2004-10 2004-11 2004-12 2005-1 2005-2 2005-3 2005-4 2005-5 2005-6 2005-7 2005-8 2005-9 2005-10  2005-11 2005-12
1       1       1       1      1      0      0       0       0       0       0       0      0       0       0       0
2       0       0       0      0      0      0       0       0       0       0       0      0       0       0       0
3       0       0       0      0      0      0       0       0       0       0       0      0       0       0       0

这里是data.table的另一种方法,应该会给出类似的结果。您可以从进入和退出日期创建一个seq月份的影响(使用floor_date从该月的第一天开始(。然后使用dcast将其放入年月的宽格式中。

library(lubridate)
library(data.table)
setDT(df)[ , list(Client = Client, month = format(seq(floor_date(EntryDate, "month"), floor_date(ExitDate, "month"), by = "month"), format = "%Y-%m")), by = 1:nrow(df)][
, dcast(.SD, Client ~ month, fun = length)
]

输出

Client 2004-01 2004-02 2004-03 2004-04 2004-05 2004-06 2004-07 2004-08 2004-09 2004-10 2004-11 2004-12 2005-01 2005-02 2005-03 2005-04 2005-05 2005-06 2005-07 2005-08 2005-09
1:      A       1       1       1       1       1       1       1       1       1       1       1       1       1       0       0       0       0       0       0       0       0
2:      B       0       1       1       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0
3:      C       0       0       1       1       1       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0
4:      D       0       0       0       1       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0
5:      E       0       0       0       0       1       1       1       1       0       0       0       0       0       0       0       0       0       0       0       0       0
6:      F       0       0       0       0       1       1       1       1       0       0       0       0       0       0       0       0       0       0       0       0       0
7:      G       0       0       0       0       0       0       1       1       1       1       1       1       1       1       1       1       1       1       0       0       0
8:      H       0       0       0       0       0       0       1       1       1       1       0       0       0       0       0       0       0       0       0       0       0
9:      I       0       0       0       0       0       0       0       0       1       1       0       0       0       0       0       0       0       0       0       0       0
10:      J       0       0       0       0       0       0       0       0       0       1       1       0       0       0       0       0       0       0       0       0       0
11:      K       0       0       0       0       0       0       0       0       0       0       1       1       0       0       0       0       0       0       0       0       0
12:      L       0       0       0       0       0       0       0       0       0       0       0       1       1       1       1       1       1       1       1       1       1
2005-10 2005-11 2005-12
1:       0       0       0
2:       0       0       0
3:       0       0       0
4:       0       0       0
5:       0       0       0
6:       0       0       0
7:       0       0       0
8:       0       0       0
9:       0       0       0
10:       0       0       0
11:       0       0       0
12:       1       1       1

这可能是一个选项。我不需要interval_client,只需要在ExitDateEntryDate之间设置一个条件。

for (y in years){
for (m in months){
df[ ,paste0(y, '-', m)] <- ifelse(ExitDate >= ym(paste0(y, '-', m)) & 
EntryDate <= ym(paste0(y, '-', m)), 
1, 0)
}
}
#Client  EntryDate   ExitDate                interval_client 2004-1 2004-2 2004-3 2004-4 2004-5 2004-6 2004-7 2004-8 2004-9 2004-10
#1       A 2004-01-01 2005-01-22 2004-01-01 UTC--2005-01-22 UTC      1      1      1      1      1      1      1      1      1       1
#2       B 2004-02-11 2004-03-01 2004-02-11 UTC--2004-03-01 UTC      0      0      1      0      0      0      0      0      0       0
#3       C 2004-03-21 2004-05-21 2004-03-21 UTC--2004-05-21 UTC      0      0      0      1      1      0      0      0      0       0
#4       D 2004-04-14 2004-04-28 2004-04-14 UTC--2004-04-28 UTC      0      0      0      0      0      0      0      0      0       0
#5       E 2004-05-22 2004-08-09 2004-05-22 UTC--2004-08-09 UTC      0      0      0      0      0      1      1      1      0       0
#6       F 2004-05-08 2004-08-08 2004-05-08 UTC--2004-08-08 UTC      0      0      0      0      0      1      1      1      0       0
#7       G 2004-07-20 2005-06-20 2004-07-20 UTC--2005-06-20 UTC      0      0      0      0      0      0      0      1      1       1
#8       H 2004-07-28 2004-10-28 2004-07-28 UTC--2004-10-28 UTC      0      0      0      0      0      0      0      1      1       1
#9       I 2004-09-30 2004-10-30 2004-09-30 UTC--2004-10-30 UTC      0      0      0      0      0      0      0      0      0       1
#10      J 2004-10-25 2004-11-01 2004-10-25 UTC--2004-11-01 UTC      0      0      0      0      0      0      0      0      0       0
#11      K 2004-11-28 2004-12-28 2004-11-28 UTC--2004-12-28 UTC      0      0      0      0      0      0      0      0      0       0
#12      L 2004-12-30 2005-12-30 2004-12-30 UTC--2005-12-30 UTC      0      0      0      0      0      0      0      0      0       0

最新更新