我有一个看起来像这样的数据集:
id gr dt.DMYMS dt.DMYMS.pos
1 A R21 01/03/19 05:39 2019-03-01 05:39:00
2 A R21 01/03/19 05:42 2019-03-01 05:42:00
3 A F23 01/03/19 06:23 2019-03-01 06:23:00
4 A F23 01/03/19 06:26 2019-03-01 06:26:00
5 A F23 01/03/19 06:28 2019-03-01 06:28:00
6 A F24 01/03/19 07:08 2019-03-01 07:08:00
[...]
这是获取它的代码:
dat = structure(list(id = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B"), gr = c("R21", "R21", "F23", "F23",
"F23", "F24", "F24", "F25", "F25", "F26", "F26", "F27", "F27",
"R29", "R29", "R31", "R31", "N34", "N34", "F35", "F35", "F36",
"F36", "F37", "F37", "F39", "F39", "R40", "R40", "R17", "R17",
"F18", "F18", "F19", "F19", "F20", "F20", "R21", "R21", "F22",
"F22", "F23", "F23", "F25", "F25", "Gr1", "Gr1", "F28", "F28",
"F30", "F30", "F32", "F32", "R33", "R33", "F37", "F37", "F39",
"F39", "R40", "R40", "F41", "F41", "F42", "F42", "R43", "R43",
"F1", "F1", "R15", "R15", "R15", "F17", "F17", "F19", "F19",
"F20", "F20"), dt.DMYMS = c("01/03/19 05:39", "01/03/19 05:42",
"01/03/19 06:23", "01/03/19 06:26", "01/03/19 06:28", "01/03/19 07:08",
"01/03/19 07:10", "03/03/19 06:17", "03/03/19 06:20", "03/03/19 06:58",
"03/03/19 07:00", "03/03/19 07:49", "03/03/19 07:50", "04/03/19 05:28",
"04/03/19 05:31", "05/03/19 05:33", "05/03/19 05:36", "08/03/19 06:21",
"08/03/19 06:24", "08/03/19 09:24", "08/03/19 09:28", "08/03/19 17:33",
"08/03/19 17:39", "11/03/19 06:24", "11/03/19 06:26", "12/03/19 06:21",
"12/03/19 06:24", "13/03/19 05:38", "13/03/19 05:40", "28/02/19 05:42",
"28/02/19 05:46", "28/02/19 06:27", "28/02/19 06:37", "28/02/19 06:59",
"28/02/19 07:04", "28/02/19 09:30", "28/02/19 09:32", "01/03/19 05:43",
"01/03/19 05:47", "01/03/19 06:10", "01/03/19 06:11", "01/03/19 06:34",
"01/03/19 06:35", "01/03/19 07:17", "01/03/19 07:17", "01/03/19 08:24",
"01/03/19 08:26", "01/03/19 09:30", "01/03/19 09:37", "03/03/19 06:41",
"03/03/19 06:41", "03/03/19 09:37", "03/03/19 09:37", "04/03/19 05:26",
"04/03/19 05:54", "05/03/19 08:55", "05/03/19 09:12", "07/03/19 06:07",
"07/03/19 06:27", "08/03/19 05:39", "08/03/19 05:55", "08/03/19 09:37",
"08/03/19 09:51", "12/03/19 06:31", "12/03/19 06:52", "13/03/19 05:43",
"13/03/19 05:52", "26/02/19 10:59", "26/02/19 11:57", "28/02/19 05:39",
"28/02/19 05:49", "28/02/19 05:57", "28/02/19 06:55", "28/02/19 06:59",
"28/02/19 07:30", "28/02/19 07:31", "28/02/19 08:50", "28/02/19 08:51"
), dt.DMYMS.pos = structure(c(1551418740, 1551418920, 1551421380,
1551421560, 1551421680, 1551424080, 1551424200, 1551593820, 1551594000,
1551596280, 1551596400, 1551599340, 1551599400, 1551677280, 1551677460,
1551763980, 1551764160, 1552026060, 1552026240, 1552037040, 1552037280,
1552066380, 1552066740, 1552285440, 1552285560, 1552371660, 1552371840,
1552455480, 1552455600, 1551332520, 1551332760, 1551335220, 1551335820,
1551337140, 1551337440, 1551346200, 1551346320, 1551418980, 1551419220,
1551420600, 1551420660, 1551422040, 1551422100, 1551424620, 1551424620,
1551428640, 1551428760, 1551432600, 1551433020, 1551595260, 1551595260,
1551605820, 1551605820, 1551677160, 1551678840, 1551776100, 1551777120,
1551938820, 1551940020, 1552023540, 1552024500, 1552037820, 1552038660,
1552372260, 1552373520, 1552455780, 1552456320, 1551178740, 1551182220,
1551332340, 1551332940, 1551333420, 1551336900, 1551337140, 1551339000,
1551339060, 1551343800, 1551343860), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), row.names = c(NA, -78L), class = "data.frame")
我想通过gr
来计算组,并计算gr
中两个相同组之间的平均值(每个gr
有2个点(。
这可以用以下代码完成:
dat %>%
dplyr::group_by(id, gr) %>%
dplyr::summarise(mean.pergroup = mean(dt.DMYMS.pos)) %>%
dplyr::arrange(id, gr, mean.pergroup)
哪个给出:
`summarise()` regrouping output by 'id' (override with `.groups` argument)
# A tibble: 38 x 3
# Groups: id [2]
id gr mean.pergroup
<chr> <chr> <dttm>
1 A F18 2019-02-28 06:32:00
2 A F19 2019-02-28 07:01:30
3 A F20 2019-02-28 09:31:00
4 A F23 2019-03-01 06:25:40
5 A F24 2019-03-01 07:09:00
6 A F25 2019-03-03 06:18:30
[...]
然后我想";组";通过天,找到时间上的连续差异,并计算每天的平均差异。即,查看上面的第1、2和3行,对于第2019-02-28
天的idA
,它将是mean(07:01:30 -06:32:00 + 09:31:00 - 07:01:30) = mean(29.5 min+ 149.5min) = 89.5 min
。
我可以得到时间上的差异,但不是所有的线路:
test = dat %>%
dplyr::group_by(id, gr) %>%
dplyr::summarise(mean.pergroup = mean(dt.DMYMS.pos)) %>%
dplyr::arrange(id, gr, mean.pergroup)
test[1:3,"mean.pergroup"][3,] - test[1:3,"mean.pergroup"][2,]
提供
mean.pergroup
1 2.491667 hours
对于第3行和第2行之间的差异…
我只能想到一个for循环,但我很确定在dplyr中有一种方法可以做到这一点。
在数据集中有多行的日期为2019-02-28
,但如果只考虑连续值,则可以在data.table
中使用rleid
。
dat %>%
dplyr::group_by(id, gr) %>%
dplyr::summarise(mean.pergroup = mean(dt.DMYMS.pos)) %>%
dplyr::ungroup() %>%
dplyr::arrange(id, gr, mean.pergroup) %>%
dplyr::mutate(date = as.Date(mean.pergroup)) %>%
dplyr::group_by(grp = data.table::rleid(date)) %>%
dplyr::summarise(date = first(date),
difference = mean(difftime(mean.pergroup,lag(mean.pergroup),
units = 'mins'), na.rm = TRUE))