按R中最近的日期合并两个数据帧



我有两个数据帧,我想按最近的日期合并。df2的日期通常与df1的日期相同。还有一些df2的值根本不适合df1。合并后,我想知道合并日期之间的时间差。例如,"萨利"号适合两个高峰,"卡尔"号不适合任何人。我尝试了不同的解决方案,例如使用data.table roll="最新"功能

df1 <- data.frame(date = (c("23.11.2021 20:56:06", "23.11.2021 20:56:07","23.11.2021 20:56:08",
"23.11.2021 20:56:09","23.11.2021 20:56:10",
"23.11.2021 20:56:11", "23.11.2021 22:13:56",
"23.11.2021 22:13:57", "23.11.2021 22:13:58",
"23.11.2021 22:13:59", "24.11.2021 03:23:21",
"24.11.2021 03:23:22", "24.11.2021 03:23:23",
"24.11.2021 03:23:24", "24.11.2021 03:23:25",
"24.11.2021 03:24:34", "24.11.2021 03:24:35", 
"24.11.2021 03:24:36", "24.11.2021 03:24:37")),
value = (c(500, 900, 1000, 200, 300, 10, 1000, 450, 950, 600,
100, 750, 150, 200, 300, 400, 900, 1020, 800)))
df2 <- data.frame(date = (c("23.11.2021 20:55:47", "23.11.2021 21:17:48",
"23.11.2021 21:44:19", "23.11.2021 21:55:57",
"23.11.2021 22:16:01","23.11.2021 22:26:35",
"23.11.2021 22:28:43", "23.11.2021 23:14:52", 
"23.11.2021 23:32:30","23.11.2021 23:33:43",
"24.11.2021 03:11:30", "24.11.2021 03:23:37",
"24.11.2021 04:42:41", "24.11.2021 04:53:09",
"24.11.2021 04:58:19")),
value = (c("Ship Emma", "Ship Carl", "Ship Rudi", "Ship Tim", "Ship Amy",
"Ship Fred", "Ship Noemi"
, "Ship Fanny", "Ship Ole", "Ship Ally",
"Ship Hugo", "Ship Sally", "Ship Peter", "Ship Harry", "Ship Piet")))
result <- data.frame(date = (c("23.11.2021 20:56:06", "23.11.2021 20:56:07",
"23.11.2021 20:56:08", "23.11.2021 20:56:09",
"23.11.2021 20:56:10","23.11.2021 20:56:11",
"23.11.2021 22:13:56", "23.11.2021 22:13:57",
"23.11.2021 22:13:58", "23.11.2021 22:13:59",
"24.11.2021 03:23:21", "24.11.2021 03:23:22",
"24.11.2021 03:23:23", "24.11.2021 03:23:24",
"24.11.2021 03:23:25", "24.11.2021 03:24:34",
"24.11.2021 03:24:35", "24.11.2021 03:24:36",
"24.11.2021 03:24:37")),
value = (c(500, 900, 1000, 200, 300, 10, 1000, 450, 950, 600,
100, 750, 150, 200, 300, 400, 900, 1020, 800)), 
id = (c("Ship Emma", "Ship Emma", "Ship Emma", "Ship Emma", "Ship Emma",
"Ship Emma", "Ship Amy", "Ship Amy", "Ship Amy", "Ship Amy",
"Ship Sally", "Ship Sally", "Ship Sally", "Ship Sally",
"Ship Sally", "Ship Sally", "Ship Sally", "Ship Sally","Ship Sally")), 
difference = (c("00:00:19", "00:00:20", "00:00:21", "00:00:22", "00:00:23",
"00:00:24", "00:00:05", "00:00:04", "00:00:03", "00:00:02",
"00:00:16", "00:00:15", "00:00:14", "00:00:13", "00:00:12",
"00:00:57", "00:00:58", "00:00:59", "00:01:00")))
df1$date<- as.POSIXct(df1$date, tz = "GMT", format = "%d.%m.%Y %H:%M:%S")
df2$date<- as.POSIXct(df2$date, tz = "GMT", format = "%d.%m.%Y %H:%M:%S")
result$date<- as.POSIXct(result$date, tz = "GMT", format = "%d.%m.%Y %H:%M:%S")

我建议两种方法。第一个使用距离矩阵并执行df1到df2的left_join。即距离矩阵由给出

dateDist <- outer(pull(df1, date), pull(df2, date), "-") %>%
abs()

接下来,对于df1的每一行,距离最近的df2的行由给出

closest.df1 <- apply(dateDist, 1, which.min)

最后,手动执行合并:

cbind(rename_with(df1, ~paste0("df1.", "", .x)),
rename_with(df2[closest.df1,], ~paste0("df2.", "", .x)))
##>+                 df1.date df1.value            df2.date  df2.value
##>1    2021-11-23 20:56:06       500 2021-11-23 20:55:47  Ship Emma
##>1.1  2021-11-23 20:56:07       900 2021-11-23 20:55:47  Ship Emma
##>1.2  2021-11-23 20:56:08      1000 2021-11-23 20:55:47  Ship Emma
##>1.3  2021-11-23 20:56:09       200 2021-11-23 20:55:47  Ship Emma
##>1.4  2021-11-23 20:56:10       300 2021-11-23 20:55:47  Ship Emma
##>1.5  2021-11-23 20:56:11        10 2021-11-23 20:55:47  Ship Emma
##>5    2021-11-23 22:13:56      1000 2021-11-23 22:16:01   Ship Amy
##>5.1  2021-11-23 22:13:57       450 2021-11-23 22:16:01   Ship Amy
##>5.2  2021-11-23 22:13:58       950 2021-11-23 22:16:01   Ship Amy
##>5.3  2021-11-23 22:13:59       600 2021-11-23 22:16:01   Ship Amy
##>12   2021-11-24 03:23:21       100 2021-11-24 03:23:37 Ship Sally
##>12.1 2021-11-24 03:23:22       750 2021-11-24 03:23:37 Ship Sally
##>12.2 2021-11-24 03:23:23       150 2021-11-24 03:23:37 Ship Sally
##>12.3 2021-11-24 03:23:24       200 2021-11-24 03:23:37 Ship Sally
##>12.4 2021-11-24 03:23:25       300 2021-11-24 03:23:37 Ship Sally
##>12.5 2021-11-24 03:24:34       400 2021-11-24 03:23:37 Ship Sally
##>12.6 2021-11-24 03:24:35       900 2021-11-24 03:23:37 Ship Sally
##>12.7 2021-11-24 03:24:36      1020 2021-11-24 03:23:37 Ship Sally
##>12.8 2021-11-24 03:24:37       800 2021-11-24 03:23:37 Ship Sally

第二种方法包括首先计算df1df2的所有行的笛卡尔乘积,然后只选择具有最小距离的行。这里的技巧是使用inner_join(..., by =character())来获得两个数据帧的所有组合:

mutate(df1, id = row_number()) %>%
inner_join(mutate(df2, id = row_number()),by = character()) |>
mutate(dist = abs(date.x - date.y)) |>
group_by(id.x) |>
filter(dist == min(dist)) |>
select(-id.x, -id.y, -dist)
##>+ # A tibble: 19 × 7
##># Groups:   id.x [19]
##>   date.x              value.x  id.x date.y              value.y     id.y dist  
##>   <dttm>                <dbl> <int> <dttm>              <chr>      <int> <drtn>
##> 1 2021-11-23 20:56:06     500     1 2021-11-23 20:55:47 Ship Emma      1  19 s…
##> 2 2021-11-23 20:56:07     900     2 2021-11-23 20:55:47 Ship Emma      1  20 s…
##> 3 2021-11-23 20:56:08    1000     3 2021-11-23 20:55:47 Ship Emma      1  21 s…
##> 4 2021-11-23 20:56:09     200     4 2021-11-23 20:55:47 Ship Emma      1  22 s…
##> 5 2021-11-23 20:56:10     300     5 2021-11-23 20:55:47 Ship Emma      1  23 s…
##> 6 2021-11-23 20:56:11      10     6 2021-11-23 20:55:47 Ship Emma      1  24 s…
##> 7 2021-11-23 22:13:56    1000     7 2021-11-23 22:16:01 Ship Amy       5 125 s…
##> 8 2021-11-23 22:13:57     450     8 2021-11-23 22:16:01 Ship Amy       5 124 s…
##> 9 2021-11-23 22:13:58     950     9 2021-11-23 22:16:01 Ship Amy       5 123 s…
##>10 2021-11-23 22:13:59     600    10 2021-11-23 22:16:01 Ship Amy       5 122 s…
##>11 2021-11-24 03:23:21     100    11 2021-11-24 03:23:37 Ship Sally    12  16 s…
##>12 2021-11-24 03:23:22     750    12 2021-11-24 03:23:37 Ship Sally    12  15 s…
##>13 2021-11-24 03:23:23     150    13 2021-11-24 03:23:37 Ship Sally    12  14 s…
##>14 2021-11-24 03:23:24     200    14 2021-11-24 03:23:37 Ship Sally    12  13 s…
##>15 2021-11-24 03:23:25     300    15 2021-11-24 03:23:37 Ship Sally    12  12 s…
##>16 2021-11-24 03:24:34     400    16 2021-11-24 03:23:37 Ship Sally    12  57 s…
##>17 2021-11-24 03:24:35     900    17 2021-11-24 03:23:37 Ship Sally    12  58 s…
##>18 2021-11-24 03:24:36    1020    18 2021-11-24 03:23:37 Ship Sally    12  59 s…
##>19 2021-11-24 03:24:37     800    19 2021-11-24 03:23:37 Ship Sally    12  60 s…

最新更新