我正在做一个整合连续时间间隔的项目。
数据集如下所示:
df <- data.frame(id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
degree=c(2,2,2,2,2,2,2,2,2,2),
start_date=c(as.Date("2016-02-22"),
as.Date("2016-03-07"),
as.Date("2016-03-21"),
as.Date("2016-04-04"),
as.Date("2016-04-18"),
as.Date("2016-05-02"),
as.Date("2016-05-16"),
as.Date("2016-06-01"),
as.Date("2016-06-13"),
as.Date("2016-06-27")),
stop_date= c(as.Date("2016-02-22"),
as.Date("2016-03-21"),
as.Date("2016-04-04"),
as.Date("2016-04-18"),
as.Date("2016-05-02"),
as.Date("2016-05-16"),
as.Date("2016-06-01"),
as.Date("2016-06-13"),
as.Date("2016-06-27"),
as.Date("2016-07-25"))
我想把它们积分到一个周期
df_result <- data.frame(id=c(1,1),
degree=c(2,2),
start_date=c(as.Date("2016-02-22"),
as.Date("2016-03-07")),
stop_date= c(as.Date("2016-02-22"),
as.Date("2016-07-25")))
我试过下面的代码:
df = df %>%
group_by(id, degree) %>%
mutate(
isConsecutive = lead(start_date) - stop_date == 0,
isConsecutive = ifelse(
is.na(isConsecutive) & lag(isConsecutive) == TRUE, FALSE, isConsecutive),
grp = cumsum(isConsecutive)) %>%
group_by(id, degree, grp) %>%
mutate(start_date = min(start_date), stop_date = max(stop_date)) %>%
slice(1) %>%
ungroup() %>%
select(-isConsecutive, -grp)
我的代码不工作,因为它只做了一次集成(即只集成日期在第一行和第二行,但不与第三行集成)
在lead
或lag
中有一个default
参数,默认情况下返回NA
,NA
在操作中返回NA
,除非我们使用is.na
来纠正它。这里,在lead
中修复很简单,即将default
更改为' last(start_date)
library(dplyr)
df %>%
group_by(id, degree) %>%
mutate(
isConsecutive = lead(start_date, default = last(start_date)) - stop_date == 0,
isConsecutive = ifelse(
is.na(isConsecutive) & lag(isConsecutive) == TRUE, FALSE, isConsecutive),
grp = cumsum(isConsecutive)) %>%
group_by(id, degree, grp) %>%
mutate(start_date = min(start_date), stop_date = max(stop_date)) %>%
slice(1) %>%
ungroup() %>%
select(-isConsecutive, -grp)
与产出
# A tibble: 1 x 4
id degree start_date stop_date
<dbl> <dbl> <date> <date>
1 1 2 2020-01-01 2020-01-20
对于新数据,我们可以使用
library(data.table)
df %>%
group_by(id, degree) %>%
mutate(grp = rleid(as.numeric(difftime(lead(start_date,
default = last(start_date)), stop_date, units = 'day')) > 0)) %>%
group_by(grp, .add = TRUE) %>%
summarise(start_date = first(start_date),
stop_date = last(stop_date), .groups = 'drop')
# A tibble: 2 x 5
id degree grp start_date stop_date
<dbl> <dbl> <int> <date> <date>
1 1 2 1 2016-02-22 2016-02-22
2 1 2 2 2016-03-07 2016-07-25
library(dplyr)
df %>%
group_by(id, degree) %>%
summarise(start_date = min(start_date), stop_date = max(stop_date))
id degree start_date stop_date
<dbl> <dbl> <date> <date>
1 1 2 2020-01-01 2020-01-20