我想合并两个数据集。我想在 ID 和日期上合并它。但是,某些日期是在它们应该在另一个日期合并的行之后的 1-2 天,因此它们被排除在外。如何在日期合并但允许日期之间最多间隔 2 天?
我的数据:
#Random letters to fill out a pathology report
pathRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse=""))
pathDate<-as.Date(c("1993-12-22","1994-05-16","1992-07-20","1996-06-02","1992-04-20","1996-08-30","1992-01-26","1991-03-23","1995-12-28","1995-07-15","1993-04-04","1994-01-11","1999-08-21","1993-11-10","1994-02-26","1992-08-06","1993-06-29","1997-03-08","1998-03-03","1998-04-17"))
#Random Numbers
pathHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634")
#Create the dataframe
pathdf<-data.frame(pathRep,pathDate,pathHospitalNum)
#Random letters to fill out a pathology report
EndoRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse=""))
EndoDate<-as.Date(c("1993-12-22","1994-05-14","1992-07-19","1996-06-01","1992-04-20","1996-08-30","1992-01-24","1991-03-21","1995-12-28","1995-07-15","1993-04-02","1994-01-10","1999-08-21","1993-11-10","1994-02-26","1992-08-05","1993-06-29","1997-03-07","1998-03-03","1998-04-17"))
#Random Numbers
EndoHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634")
#Create the dataframe:
Endodf<-data.frame(EndoRep,EndoDate,EndoHospitalNum)
这只是合并了确切的日期:
merge(Endodf,pathdf,by=c("Date","HospNum"))
我想也许我可以创建一个 difftime 列,但我想我最终会将每个日期与每个日期进行比较,这可能很耗时?
虽然@alaybourn对data.table
滚动连接的回答非常好,但我将添加另一个选项来解决问题的"允许日期之间最多 2 天的差异"部分(但主要只是为了分享对fuzzyjoin
包的一些热爱(。
library(dplyr)
library(fuzzyjoin)
Path <-
data.frame(
PathDate = as.Date(c("1993-12-22", "1994-05-16", "1992-07-20", "1996-06-02", "1992-04-20", "1996-08-30", "1992-01-26", "1991-03-23", "1995-12-28", "1995-07-15", "1993-04-04", "1994-01-11", "1999-08-21", "1993-11-10", "1994-02-26", "1992-08-06", "1993-06-29", "1997-03-08", "1998-03-03", "1998-04-17")),
PathHospNum = c("H432243", "T662272", "G424284", "W787634", "H432243", "Y980037", "H432243", "W787634", "Y980037", "E432243", "U874287", "Y980037", "U874287", "W787634", "Y980037", "H432243", "Y980037", "E432243", "W787634", "W787634"),
PathRep = replicate(20, paste(sample(LETTERS, 10, replace = T), collapse = ""))
)
Endo <-
data.frame(
EndoDate = as.Date(c("1993-12-22", "1994-05-14", "1992-07-19", "1996-06-01", "1992-04-20", "1996-08-30", "1992-01-24", "1991-03-21", "1995-12-28", "1995-07-15", "1993-04-02", "1994-01-10", "1999-08-21", "1993-11-10", "1994-02-26", "1992-08-05", "1993-06-29", "1997-03-07", "1998-03-03", "1998-04-17")),
EndoHospNum = c("H432243", "T662272", "G424284", "W787634", "H432243", "Y980037", "H432243", "W787634", "Y980037", "E432243", "U874287", "Y980037", "U874287", "W787634", "Y980037", "H432243", "Y980037", "E432243", "W787634", "W787634"),
EndoRep = replicate(20, paste(sample(LETTERS, 10, replace = T), collapse = ""))
)
Path$date <- as.numeric(Path$PathDate)
Endo$date <- as.numeric(Endo$EndoDate)
Result <-
fuzzyjoin::difference_full_join(Endo, Path, by = 'date', max_dist = 2, distance_col = 'Days') %>%
filter(EndoHospNum == PathHospNum) %>%
select(HospNum = EndoHospNum, EndoDate, PathDate, Days, EndoRep, PathRep)
Result
HospNum EndoDate PathDate Days EndoRep PathRep
1 H432243 1993-12-22 1993-12-22 0 YBGDMGMZOJ HISSGSCRFR
2 T662272 1994-05-14 1994-05-16 2 ANAPSCKUEB HIDIFHBDBL
3 G424284 1992-07-19 1992-07-20 1 HKOCQZAXDU PLXGUPHQBM
4 W787634 1996-06-01 1996-06-02 1 OTPLUZBLAF KGVILKEHLI
5 H432243 1992-04-20 1992-04-20 0 GRWJUQPNET UGCKMNKDLW
6 Y980037 1996-08-30 1996-08-30 0 ORUVMMGGAV EYOWEYAZFK
7 H432243 1992-01-24 1992-01-26 2 JVSPGIVXEM LCNXQNVGGR
8 W787634 1991-03-21 1991-03-23 2 WXZNHJIBZW OTXKNTYNKV
9 Y980037 1995-12-28 1995-12-28 0 PQQQLKTYPG UAMMKJZRFG
10 E432243 1995-07-15 1995-07-15 0 VYLDWUNAFP EXNTQSYVJM
11 U874287 1993-04-02 1993-04-04 2 MTBBBVULOD CTKXUKEOQG
12 Y980037 1994-01-10 1994-01-11 1 TRZWBYAUZR XIHXMOEFVP
13 U874287 1999-08-21 1999-08-21 0 DYBUWJIAZB KFFGYNQUYM
14 W787634 1993-11-10 1993-11-10 0 CBWRBZAPAF KYUOZSLIGF
15 Y980037 1994-02-26 1994-02-26 0 GDUFEYZQFU BUSFQIJDHK
16 H432243 1992-08-05 1992-08-06 1 JMGSCWTHOI ZRCJFDFNCX
17 Y980037 1993-06-29 1993-06-29 0 HDTGHCMORL EQYWNJHOET
18 E432243 1997-03-07 1997-03-08 1 WIMMVJHDSE LYLDELIBYK
19 W787634 1998-03-03 1998-03-03 0 GRHBDHEWJF AWDYEQZZWY
20 W787634 1998-04-17 1998-04-17 0 AOFIXWLZDT BBUEROUIWO
如果您在data.table
中使用roll="nearest"
选项,它将适用于此集合,但如果您尝试不加入日期> 2 天不同,它将失败。
library(data.table)
#Random letters to fill out a pathology report
pathRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse=""))
pathDate<-as.Date(c("1993-12-22","1994-05-16","1992-07-20","1996-06-02","1992-04-20","1996-08-30","1992-01-26","1991-03-23","1995-12-28","1995-07-15","1993-04-04","1994-01-11","1999-08-21","1993-11-10","1994-02-26","1992-08-06","1993-06-29","1997-03-08","1998-03-03","1998-04-17"))
#Random Numbers
pathHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634")
#Create the data table and set key fields
pathdt<-data.table(pathRep,pathDate,pathHospitalNum)
setkey(pathdt, pathHospitalNum, pathDate)
#Random letters to fill out a pathology report
EndoRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse=""))
EndoDate<-as.Date(c("1993-12-22","1994-05-14","1992-07-19","1996-06-01","1992-04-20","1996-08-30","1992-01-24","1991-03-21","1995-12-28","1995-07-15","1993-04-02","1994-01-10","1999-08-21","1993-11-10","1994-02-26","1992-08-05","1993-06-29","1997-03-07","1998-03-03","1998-04-17"))
#Random Numbers
EndoHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634")
#Create the data table and set keys
Endodt<-data.table(EndoRep,EndoDate,EndoHospitalNum)
setkey(Endodt, EndoHospitalNum, EndoDate)
#run the join
Endodt[pathdt,roll="nearest"]