r-连接(合并)表时,在开始和结束时间间隔内重复数据



我有两个必须连接的数据帧。但当连接两个数据表时,我想在开始时间和结束时间内复制df2的每一行。新数据帧的其余行应显示为NA

我尝试过使用left join,但它不会在开始和结束时间内复制行。

df <- dplyr::left_join(df1, df2, by = "Session_start")

这两个数据帧如下所示。

head(df1)
#         Session_start Robot_ID
# 1 2022-07-07 00:05:19       R1
# 2 2022-07-07 00:05:20       R2
# 3 2022-07-07 00:05:21       R3
# 4 2022-07-07 00:05:22       R4
# 5 2022-07-07 00:05:23       R5
# 6 2022-07-07 00:05:24       R6
df2
#         Session_start         Session_End Animal_ID
# 1 2022-07-07 00:05:19 2022-07-07 00:05:21       ID1
# 2 2022-07-07 00:05:24 2022-07-07 00:05:26       ID2
# 3 2022-07-07 00:05:27 2022-07-07 00:05:31       ID3
# 4 2022-07-07 00:05:33 2022-07-07 00:05:34       ID4

所需输出为:

机器人ID2022-07-07 00:05:19R12022-07-0700:05:21>ID12022-07-07 00:05:20R22022-07-07 00:05:21>ID12022-07-07 00:05:21R32022-07-0700:05:21>ID12022-07-07 00:05:22R4NA2022-07-07 00:05:23R5ANA2022-07-07 00:05:24R62022-07-07 00:05:26ID12022-07-07 00:05:25R72022-07-07 00:05:26ID22022-07-07 00:05:26R82022-07-07 00:05:26>ID22022-07-07 00:05:27R92022-07-07 00:05:28R102022-07-07 00:05:31>ID32022-07-07 00:05:29R112022-07-07 00:05:31ID32022-07-07 00:05:30R122022-07-0700:05:31>ID32022-07-07 00:05:31R132022-07-07 00:05:31>ID32022-07-07 00:05:32R14NA2022-07-07 00:05:33R152022-07-07 00:05:34>ID42022-07-07 00:05:34R162022-07-07 00:05:34ID4

data.table使用非equi,更新联接可能会使其变得更好:

library(data.table)
setDT(df1)
setDT(df2)
df1[
df2,
on=.(Session_start>=Session_start, Session_start<=Session_End),
c("Animal_ID","Session_End") := .(i.Animal_ID, i.Session_End)
]
df1
##          Session_start Robot_ID Animal_ID         Session_End
## 1: 2022-07-07 08:05:19       R1       ID1 2022-07-07 08:05:21
## 2: 2022-07-07 08:05:20       R2       ID1 2022-07-07 08:05:21
## 3: 2022-07-07 08:05:21       R3       ID1 2022-07-07 08:05:21
## 4: 2022-07-07 08:05:22       R4      <NA>                <NA>
## 5: 2022-07-07 08:05:23       R5      <NA>                <NA>
## 6: 2022-07-07 08:05:24       R6       ID2 2022-07-07 08:05:26
## 7: 2022-07-07 08:05:25       R7       ID2 2022-07-07 08:05:26
## 8: 2022-07-07 08:05:26       R8       ID2 2022-07-07 08:05:26
## 9: 2022-07-07 08:05:27       R9       ID3 2022-07-07 08:05:31
##10: 2022-07-07 08:05:28      R10       ID3 2022-07-07 08:05:31
##11: 2022-07-07 08:05:29      R11       ID3 2022-07-07 08:05:31
##12: 2022-07-07 08:05:30      R12       ID3 2022-07-07 08:05:31
##13: 2022-07-07 08:05:31      R13       ID3 2022-07-07 08:05:31
##14: 2022-07-07 08:05:32      R14      <NA>                <NA>
##15: 2022-07-07 08:05:33      R15       ID4 2022-07-07 08:05:34
##16: 2022-07-07 08:05:34      R16       ID4 2022-07-07 08:05:34

首先,找到索引w,其中df的会话开始位于使用outer()df2的会话间隔之间。接下来CCD_ 8将它们发送到相应的切片。最后merge余数。

w <- outer(df1[, 1], as.data.frame(t(df2[1:2])), 
Vectorize((x, y) x >= y[1] & x <= y[2])) |>
apply(2, which)
Map((x, y) cbind(df1[x, ], df2[y, -1]), w, seq_len(nrow(df2))) |>
do.call(what=rbind) |> merge(df1, all=TRUE)
#          Session_start Robot_ID         Session_End Animal_ID
# 1  2022-07-07 00:05:19       R1 2022-07-07 00:05:21       ID1
# 2  2022-07-07 00:05:20       R2 2022-07-07 00:05:21       ID1
# 3  2022-07-07 00:05:21       R3 2022-07-07 00:05:21       ID1
# 4  2022-07-07 00:05:22       R4                <NA>      <NA>
# 5  2022-07-07 00:05:23       R5                <NA>      <NA>
# 6  2022-07-07 00:05:24       R6 2022-07-07 00:05:26       ID2
# 7  2022-07-07 00:05:25       R7 2022-07-07 00:05:26       ID2
# 8  2022-07-07 00:05:26       R8 2022-07-07 00:05:26       ID2
# 9  2022-07-07 00:05:27       R9 2022-07-07 00:05:31       ID3
# 10 2022-07-07 00:05:28      R10 2022-07-07 00:05:31       ID3
# 11 2022-07-07 00:05:29      R11 2022-07-07 00:05:31       ID3
# 12 2022-07-07 00:05:30      R12 2022-07-07 00:05:31       ID3
# 13 2022-07-07 00:05:31      R13 2022-07-07 00:05:31       ID3
# 14 2022-07-07 00:05:32      R14                <NA>      <NA>
# 15 2022-07-07 00:05:33      R15 2022-07-07 00:05:34       ID4
# 16 2022-07-07 00:05:34      R16 2022-07-07 00:05:34       ID4

注意:即使解决方案在没有它的情况下也能工作(按字母顺序比较日期),但在处理日期时间时应始终使用"POSIXct"格式。如果你还没有,请将其转换为

df1$Session_start <- as.POSIXct(df1$Session_start)
df2[1:2] <- lapply(df2[1:2], as.POSIXct)

这个答案比atemail和jay.sf的要长得多,但我仍然会发布它,所以你有更多的想法。

我的方法是使用辅助变量,使用lubridate以确保使用正确的格式,然后开始传播Animal_ID和Session_End数据。

# Loading libraries -------------------------------------------------------
library(dplyr)
library(lubridate)
# Defining datasets -------------------------------------------------------
Session_start <-
c(
"2022-07-07 00:05:19",
"2022-07-07 00:05:24",
"2022-07-07 00:05:27",
"2022-07-07 00:05:33"
)
Session_End <-
c(
"2022-07-07 00:05:21",
"2022-07-07 00:05:26",
"2022-07-07 00:05:31",
"2022-07-07 00:05:34"
)
Animal_ID <- c("ID1", "ID2", "ID3", "ID4")
df2 <- data.frame(Session_start, Session_End, Animal_ID)
Session_start <-
c(
"2022-07-07 00:05:19",
"2022-07-07 00:05:20",
"2022-07-07 00:05:21",
"2022-07-07 00:05:22",
"2022-07-07 00:05:23",
"2022-07-07 00:05:24",
"2022-07-07 00:05:25",
"2022-07-07 00:05:26",
"2022-07-07 00:05:27",
"2022-07-07 00:05:28",
"2022-07-07 00:05:29",
"2022-07-07 00:05:30",
"2022-07-07 00:05:31",
"2022-07-07 00:05:32",
"2022-07-07 00:05:33",
"2022-07-07 00:05:34"
)
Robot_ID <-
c(
"R1",
"R2",
"R3",
"R4",
"R5",
"R6",
"R7",
"R8",
"R9",
"R10",
"R11",
"R12",
"R13",
"R14",
"R15",
"R16"
)
df1 <- data.frame(Session_start, Robot_ID)
# Joining with data propagation -------------------------------------------
df <-
dplyr::left_join(df1, df2, by = "Session_start") |>
arrange(Session_start) |>
mutate(
Session_start =
Session_start |>
lubridate::as_datetime(),
Session_End =
Session_End |>
lubridate::as_datetime()
) |>
mutate(
is_na_Session_End = if_else(
condition = is.na(Session_End),
true = FALSE,
false = TRUE
),
number_of_non_NA_Session_End = cumsum(is_na_Session_End)
) |>
group_by(number_of_non_NA_Session_End) |>
mutate(Session_End =
Session_End |>
first(),
Animal_ID =
Animal_ID |>
first()) |>
mutate(
Session_End = if_else(
condition = Session_start <= Session_End,
true = Session_End,
false = NA_POSIXct_
),
Animal_ID = if_else(
condition = Session_start <= Session_End,
true = Animal_ID,
false = NA_character_
)
) |>
ungroup() |>
select(-is_na_Session_End,
-number_of_non_NA_Session_End) |>
as.data.frame()
df

这是我的答案。这有点简单,但它可以很好地与您的数据集配合使用:

# Package needed
library(dplyr)
# First, preprocess the data
df1 <- df1 %>% 
mutate_at(vars(Session_start), as.POSIXct)
df2 <- df2 %>% 
mutate_at(vars(Session_start, Session_End), as.POSIXct)
df3 <- merge(df1, df2, all = TRUE)
# Then, fill the voids 
for (i in 1:nrow(df3)) {

if (!is.na(df3$Session_End[i])) {

session_end1 <- df3$Session_End[i]
animal_id1 <- df3$Animal_ID[i]

} else {

if (i < nrow(df3)) {

if (df3$Session_start[i] < df3$Session_start[i+1]) {

df3$Session_End[i] <- session_end1
df3$Animal_ID[i] <- animal_id1

}

} else if (i == nrow(df3)) {

df3$Session_End[i] <- session_end1
df3$Animal_ID[i] <- animal_id1

}

}

}

相关内容

  • 没有找到相关文章

最新更新