我有两个数据。我想根据一个表中的日期是否在另一个表中的给定时间范围内来组合这些表。在dt1中我有退出日期,我想在dt2中检查哪些值在每个ID的退出日期是有效的。
dt1 <- data.table (ID = 1:10,
exit = c("31/12/2010", "01/01/2021", "30/09/2010", "31/12/2015", "30/09/2010","31/10/2018", "01/02/2016", "01/05/2015", "01/09/2013", "01/01/2016"))
dt2 <- data.table (ID = c(1,2,2,2,3,5,6,6,7,8,8,9,10),
valid_from = c("01/01/2010", "01/01/2012", "01/01/2013", "01/12/2017", "01/05/2010", "01/04/2010", "01/05/2014", "01/11/2016", "01/01/2016", "15/04/2013", "01/01/2015", "15/02/2010", "01/04/2012"),
valid_until = c("01/01/2021", "31/12/2012", "30/11/2017", "01/01/2021", "01/01/2021", "01/01/2021", "31/10/2016", "01/01/2021", "01/01/2021", "31/12/2014", "01/05/2015", "01/01/2013", "01/01/2021"),
text1 = c("a", "a", "b", "c", "b", "b", "c", "a", "a", "b", "a", "c", "a"),
text2 = c("I", "I", "II", "I", "III", "I", "II", "III", "I", "II", "II", "I", "III" ))
ID exit
1: 1 31/12/2010
2: 2 01/01/2021
3: 3 30/09/2010
4: 4 31/12/2015
5: 5 30/09/2010
6: 6 31/10/2018
7: 7 01/02/2016
8: 8 01/05/2015
9: 9 01/09/2013
10: 10 01/01/2016
ID valid_from valid_until text1 text2
1: 1 01/01/2010 01/01/2021 a I
2: 2 01/01/2012 31/12/2012 a I
3: 2 01/01/2013 30/11/2017 b II
4: 2 01/12/2017 01/01/2021 c I
5: 3 01/05/2010 01/01/2021 b III
6: 5 01/04/2010 01/01/2021 b I
7: 6 01/05/2014 31/10/2016 c II
8: 6 01/11/2016 01/01/2021 a III
9: 7 01/01/2016 01/01/2021 a I
10: 8 15/04/2013 31/12/2014 b II
11: 8 01/01/2015 01/05/2015 a II
12: 9 15/02/2010 01/01/2013 c I
13: 10 01/04/2012 01/01/2021 a III
因此,我想在dt1中返回退出日期的有效值。如果在dt2中没有找到ID(示例数据中的ID 4就是这种情况),它应该返回NA。
ID exit text1 text2
1: 1 31/12/2010 a I
2: 2 01/01/2021 c I
3: 3 30/09/2010 b III
4: 4 31/12/2015 <NA> <NA>
5: 5 30/09/2010 b I
6: 6 31/10/2018 a III
7: 7 01/02/2016 a I
8: 8 01/05/2015 a II
9: 9 01/09/2013 c I
10: 10 01/01/2016 a III
谁能帮我解决这个问题?由于输入是data.table
,请考虑使用快速的data.table
方法
library(data.table)
# // convert the date columns to `Date` class
dt1[, exit := as.IDate(exit, '%d/%m/%Y')]
dt2[, c('valid_from', 'valid_until') := .(as.IDate(valid_from, '%d/%m/%Y'),
as.IDate(valid_until, '%d/%m/%Y'))]
# // do a non-equi join
dt1[dt2, c('text1', 'text2') := .(i.text1, i.text2),
on = .(ID, exit >= valid_from, exit <= valid_until)]
与产出
> dt1
ID exit text1 text2
1: 1 2010-12-31 a I
2: 2 2021-01-01 c I
3: 3 2010-09-30 b III
4: 4 2015-12-31 <NA> <NA>
5: 5 2010-09-30 b I
6: 6 2018-10-31 a III
7: 7 2016-02-01 a I
8: 8 2015-05-01 a II
9: 9 2013-09-01 <NA> <NA>
10: 10 2016-01-01 a III
这是一个dplyr
解决方案,这是在@akrun的帮助下创建的:见这里的日期:尚未实现NAbounds=TRUE用于此非数字和非字符类型
library(dplyr)
libray(lubridate)
df1 <- left_join(dt1, dt2, by="ID") %>%
mutate(across(c(exit, valid_from, valid_until), dmy)) %>%
rowwise() %>%
mutate(match= +(dplyr::between(exit, valid_from, valid_until))) %>%
group_by(ID) %>%
filter(match==max(match) | is.na(match)) %>%
select(ID, exit, text1, text2) %>%
ungroup()
输出:
ID exit text1 text2
<dbl> <date> <chr> <chr>
1 1 2010-12-31 a I
2 2 2021-01-01 c I
3 3 2010-09-30 b III
4 4 2015-12-31 NA NA
5 5 2010-09-30 b I
6 6 2018-10-31 a III
7 7 2016-02-01 a I
8 8 2015-05-01 a II
9 9 2013-09-01 c I
10 10 2016-01-01 a III
您可以在将日期更改为Date类后使用fuzzyjoin
。
library(fuzzyjoin)
library(dplyr)
dt1 %>%
mutate(exit = as.Date(exit, '%d/%m/%Y')) %>%
fuzzy_left_join(dt2 %>%
mutate(across(starts_with('valid'), as.Date, '%d/%m/%Y')),
by = c('ID', 'exit' = 'valid_from', 'exit' = 'valid_until'),
match_fun = c(`==`, `>=`, `<=`)) %>%
select(ID = ID.x, exit, text1, text2)
# ID exit text1 text2
#1 1 2010-12-31 a I
#2 2 2021-01-01 c I
#3 3 2010-09-30 b III
#4 4 2015-12-31 <NA> <NA>
#5 5 2010-09-30 b I
#6 6 2018-10-31 a III
#7 7 2016-02-01 a I
#8 8 2015-05-01 a II
#9 9 2013-09-01 <NA> <NA>
#10 10 2016-01-01 a III