计算R中各组每天时间间隔的平均时间差(H:M)



我有一个看起来像这样的数据集:

id  gr       dt.DMYMS        dt.DMYMS.pos
1  A R21 01/03/19 05:39 2019-03-01 05:39:00
2  A R21 01/03/19 05:42 2019-03-01 05:42:00
3  A F23 01/03/19 06:23 2019-03-01 06:23:00
4  A F23 01/03/19 06:26 2019-03-01 06:26:00
5  A F23 01/03/19 06:28 2019-03-01 06:28:00
6  A F24 01/03/19 07:08 2019-03-01 07:08:00
[...]

这是获取它的代码:

dat = structure(list(id = c("A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B"), gr = c("R21", "R21", "F23", "F23", 
"F23", "F24", "F24", "F25", "F25", "F26", "F26", "F27", "F27", 
"R29", "R29", "R31", "R31", "N34", "N34", "F35", "F35", "F36", 
"F36", "F37", "F37", "F39", "F39", "R40", "R40", "R17", "R17", 
"F18", "F18", "F19", "F19", "F20", "F20", "R21", "R21", "F22", 
"F22", "F23", "F23", "F25", "F25", "Gr1", "Gr1", "F28", "F28", 
"F30", "F30", "F32", "F32", "R33", "R33", "F37", "F37", "F39", 
"F39", "R40", "R40", "F41", "F41", "F42", "F42", "R43", "R43", 
"F1", "F1", "R15", "R15", "R15", "F17", "F17", "F19", "F19", 
"F20", "F20"), dt.DMYMS = c("01/03/19 05:39", "01/03/19 05:42", 
"01/03/19 06:23", "01/03/19 06:26", "01/03/19 06:28", "01/03/19 07:08", 
"01/03/19 07:10", "03/03/19 06:17", "03/03/19 06:20", "03/03/19 06:58", 
"03/03/19 07:00", "03/03/19 07:49", "03/03/19 07:50", "04/03/19 05:28", 
"04/03/19 05:31", "05/03/19 05:33", "05/03/19 05:36", "08/03/19 06:21", 
"08/03/19 06:24", "08/03/19 09:24", "08/03/19 09:28", "08/03/19 17:33", 
"08/03/19 17:39", "11/03/19 06:24", "11/03/19 06:26", "12/03/19 06:21", 
"12/03/19 06:24", "13/03/19 05:38", "13/03/19 05:40", "28/02/19 05:42", 
"28/02/19 05:46", "28/02/19 06:27", "28/02/19 06:37", "28/02/19 06:59", 
"28/02/19 07:04", "28/02/19 09:30", "28/02/19 09:32", "01/03/19 05:43", 
"01/03/19 05:47", "01/03/19 06:10", "01/03/19 06:11", "01/03/19 06:34", 
"01/03/19 06:35", "01/03/19 07:17", "01/03/19 07:17", "01/03/19 08:24", 
"01/03/19 08:26", "01/03/19 09:30", "01/03/19 09:37", "03/03/19 06:41", 
"03/03/19 06:41", "03/03/19 09:37", "03/03/19 09:37", "04/03/19 05:26", 
"04/03/19 05:54", "05/03/19 08:55", "05/03/19 09:12", "07/03/19 06:07", 
"07/03/19 06:27", "08/03/19 05:39", "08/03/19 05:55", "08/03/19 09:37", 
"08/03/19 09:51", "12/03/19 06:31", "12/03/19 06:52", "13/03/19 05:43", 
"13/03/19 05:52", "26/02/19 10:59", "26/02/19 11:57", "28/02/19 05:39", 
"28/02/19 05:49", "28/02/19 05:57", "28/02/19 06:55", "28/02/19 06:59", 
"28/02/19 07:30", "28/02/19 07:31", "28/02/19 08:50", "28/02/19 08:51"
), dt.DMYMS.pos = structure(c(1551418740, 1551418920, 1551421380, 
1551421560, 1551421680, 1551424080, 1551424200, 1551593820, 1551594000, 
1551596280, 1551596400, 1551599340, 1551599400, 1551677280, 1551677460, 
1551763980, 1551764160, 1552026060, 1552026240, 1552037040, 1552037280, 
1552066380, 1552066740, 1552285440, 1552285560, 1552371660, 1552371840, 
1552455480, 1552455600, 1551332520, 1551332760, 1551335220, 1551335820, 
1551337140, 1551337440, 1551346200, 1551346320, 1551418980, 1551419220, 
1551420600, 1551420660, 1551422040, 1551422100, 1551424620, 1551424620, 
1551428640, 1551428760, 1551432600, 1551433020, 1551595260, 1551595260, 
1551605820, 1551605820, 1551677160, 1551678840, 1551776100, 1551777120, 
1551938820, 1551940020, 1552023540, 1552024500, 1552037820, 1552038660, 
1552372260, 1552373520, 1552455780, 1552456320, 1551178740, 1551182220, 
1551332340, 1551332940, 1551333420, 1551336900, 1551337140, 1551339000, 
1551339060, 1551343800, 1551343860), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), row.names = c(NA, -78L), class = "data.frame")

我想通过gr来计算组,并计算gr中两个相同组之间的平均值(每个gr有2个点(。

这可以用以下代码完成:

dat %>%
dplyr::group_by(id, gr) %>% 
dplyr::summarise(mean.pergroup = mean(dt.DMYMS.pos)) %>% 
dplyr::arrange(id, gr, mean.pergroup)

哪个给出:

`summarise()` regrouping output by 'id' (override with `.groups` argument)
# A tibble: 38 x 3
# Groups:   id [2]
id    gr    mean.pergroup      
<chr> <chr> <dttm>             
1 A     F18   2019-02-28 06:32:00
2 A     F19   2019-02-28 07:01:30
3 A     F20   2019-02-28 09:31:00
4 A     F23   2019-03-01 06:25:40
5 A     F24   2019-03-01 07:09:00
6 A     F25   2019-03-03 06:18:30
[...]

然后我想";组";通过,找到时间上的连续差异,并计算每天的平均差异。即,查看上面的第1、2和3行,对于第2019-02-28天的idA,它将是mean(07:01:30 -06:32:00 + 09:31:00 - 07:01:30) = mean(29.5 min+ 149.5min) = 89.5 min

我可以得到时间上的差异,但不是所有的线路:

test = dat %>%
dplyr::group_by(id, gr) %>% 
dplyr::summarise(mean.pergroup = mean(dt.DMYMS.pos)) %>% 
dplyr::arrange(id, gr, mean.pergroup)
test[1:3,"mean.pergroup"][3,] - test[1:3,"mean.pergroup"][2,]

提供

mean.pergroup
1 2.491667 hours

对于第3行和第2行之间的差异…

我只能想到一个for循环,但我很确定在dplyr中有一种方法可以做到这一点。

在数据集中有多行的日期为2019-02-28,但如果只考虑连续值,则可以在data.table中使用rleid

dat %>%
dplyr::group_by(id, gr) %>% 
dplyr::summarise(mean.pergroup = mean(dt.DMYMS.pos)) %>% 
dplyr::ungroup() %>%
dplyr::arrange(id, gr, mean.pergroup) %>%
dplyr::mutate(date = as.Date(mean.pergroup)) %>%
dplyr::group_by(grp = data.table::rleid(date)) %>%
dplyr::summarise(date = first(date),
difference = mean(difftime(mean.pergroup,lag(mean.pergroup), 
units = 'mins'), na.rm = TRUE))

最新更新