考虑这个数据帧:
idPerson idAppt decision date
1 A 1 a 2021-09-10
2 A 1 b 2021-09-11
3 A 1 c 2021-09-12
4 A 1 d 2021-09-13
5 A 2 a 2021-09-20
6 A 2 b 2021-09-21
7 A 3 a 2021-09-10
8 A 3 b 2021-09-11
9 B 1 a 2021-09-10
10 B 1 b 2021-09-11
11 B 1 c 2021-09-12
12 B 1 d 2021-09-13
13 B 2 a 2021-09-11
14 B 2 b 2021-09-12
15 B 3 a 2021-09-14
16 B 3 b 2021-09-15
对于每组idPerson
和idAppt
,我想得到一个date2
列,但有一定的条件:
- 对于
decision == "a"
开始时间晚于具有相同idPerson
的任何其他idAppt
组的decision == "d"
日期的任何组(idPerson
xidAppt
(,报告该组的decision == "d"
的日期 - 对于不符合此要求的任何其他组,
date2
应为该给定idPerson
的最早日期
哪个产生这个数据帧:
idPerson idAppt decision date date2
1 A 1 a 2021-09-10 2021-09-10
2 A 1 b 2021-09-11 2021-09-10
3 A 1 c 2021-09-12 2021-09-10
4 A 1 d 2021-09-13 2021-09-10
5 A 2 a 2021-09-20 2021-09-13
6 A 2 b 2021-09-21 2021-09-13
7 A 3 a 2021-09-10 2021-09-10
8 A 3 b 2021-09-11 2021-09-10
9 B 1 a 2021-09-10 2021-09-10
10 B 1 b 2021-09-11 2021-09-10
11 B 1 c 2021-09-12 2021-09-10
12 B 1 d 2021-09-13 2021-09-10
13 B 2 a 2021-09-11 2021-09-10
14 B 2 b 2021-09-12 2021-09-10
15 B 3 a 2021-09-14 2021-09-13
16 B 3 b 2021-09-15 2021-09-13
数据:
df <- structure(list(idPerson = c("A", "A", "A", "A", "A", "A", "A",
"A", "B", "B", "B", "B", "B", "B", "B", "B"), idAppt = c(1L,
1L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L),
decision = c("a", "b", "c", "d", "a", "b", "a", "b", "a",
"b", "c", "d", "a", "b", "a", "b"), date = structure(c(18880,
18881, 18882, 18883, 18890, 18891, 18880, 18881, 18880, 18881,
18882, 18883, 18881, 18882, 18884, 18885), class = "Date")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -16L))
EO <- structure(list(idPerson = c("A", "A", "A", "A", "A", "A", "A",
"A", "B", "B", "B", "B", "B", "B", "B", "B"), idAppt = c(1L,
1L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L),
decision = c("a", "b", "c", "d", "a", "b", "a", "b", "a",
"b", "c", "d", "a", "b", "a", "b"), date = structure(c(18880,
18881, 18882, 18883, 18890, 18891, 18880, 18881, 18880, 18881,
18882, 18883, 18881, 18882, 18884, 18885), class = "Date"),
date2 = c("2021-09-10", "2021-09-10", "2021-09-10", "2021-09-10",
"2021-09-13", "2021-09-13", "2021-09-10", "2021-09-10", "2021-09-10",
"2021-09-10", "2021-09-10", "2021-09-10", "2021-09-10", "2021-09-10",
"2021-09-13", "2021-09-13")), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
df %>%
group_by(idPerson) %>%
mutate(d_date = min(date[decision == "d"]), min_date_person = min(date)) %>%
group_by(idPerson, idAppt) %>%
mutate(date2 = if_else(date[decision == "a"] > d_date, d_date, min_date_person)) %>%
ungroup()
# # A tibble: 16 × 7
# idPerson idAppt decision date d_date min_date_person date2
# <chr> <int> <chr> <date> <date> <date> <date>
# 1 A 1 a 2021-09-10 2021-09-13 2021-09-10 2021-09-10
# 2 A 1 b 2021-09-11 2021-09-13 2021-09-10 2021-09-10
# 3 A 1 c 2021-09-12 2021-09-13 2021-09-10 2021-09-10
# 4 A 1 d 2021-09-13 2021-09-13 2021-09-10 2021-09-10
# 5 A 2 a 2021-09-20 2021-09-13 2021-09-10 2021-09-13
# 6 A 2 b 2021-09-21 2021-09-13 2021-09-10 2021-09-13
# 7 A 3 a 2021-09-10 2021-09-13 2021-09-10 2021-09-10
# 8 A 3 b 2021-09-11 2021-09-13 2021-09-10 2021-09-10
# 9 B 1 a 2021-09-10 2021-09-13 2021-09-10 2021-09-10
# 10 B 1 b 2021-09-11 2021-09-13 2021-09-10 2021-09-10
# 11 B 1 c 2021-09-12 2021-09-13 2021-09-10 2021-09-10
# 12 B 1 d 2021-09-13 2021-09-13 2021-09-10 2021-09-10
# 13 B 2 a 2021-09-11 2021-09-13 2021-09-10 2021-09-10
# 14 B 2 b 2021-09-12 2021-09-13 2021-09-10 2021-09-10
# 15 B 3 a 2021-09-14 2021-09-13 2021-09-10 2021-09-13
# 16 B 3 b 2021-09-15 2021-09-13 2021-09-10 2021-09-13
#Select earlist date for decision d by idPerson
df_d <- df%>%
filter(decision=="d")%>%
group_by(idPerson)%>%
arrange(date)%>%
slice(1)%>%
rename(earliest_d=date)%>%
select(idPerson,earliest_d)%>%
ungroup
#Select earlist date for decision a
df_a <- df%>%
filter(decision=="a")%>%
rename(earliest_a=date)%>%
select(-decision)
# merge above datasets with df, create date2 by given conditions
df2 <- df%>%
base::merge(df_d,by=c("idPerson"))%>%
base::merge(df_a,by=c("idPerson","idAppt"))%>%
mutate(date2=case_when(earliest_d>earliest_a~earliest_a,
T~earliest_d))%>%
select(-earliest_d,-earliest_a)
df2
输出:
> df2
idPerson idAppt decision date date2
1 A 1 a 2021-09-10 2021-09-10
2 A 1 b 2021-09-11 2021-09-10
3 A 1 c 2021-09-12 2021-09-10
4 A 1 d 2021-09-13 2021-09-10
5 A 2 a 2021-09-20 2021-09-13
6 A 2 b 2021-09-21 2021-09-13
7 A 3 a 2021-09-10 2021-09-10
8 A 3 b 2021-09-11 2021-09-10
9 B 1 a 2021-09-10 2021-09-10
10 B 1 b 2021-09-11 2021-09-10
11 B 1 c 2021-09-12 2021-09-10
12 B 1 d 2021-09-13 2021-09-10
13 B 2 a 2021-09-11 2021-09-11
14 B 2 b 2021-09-12 2021-09-11
15 B 3 a 2021-09-14 2021-09-13
16 B 3 b 2021-09-15 2021-09-13