R语言 通过基于组移动开始时间和结束时间来重新格式化数据帧,以便在新数据帧中开始和结束时间,而无需循环



我有一组工作代码,使用下面的 for 循环来处理数据帧,但如果可能的话,需要在没有 for 循环的情况下对其进行优化。 我已经搜索了一段时间才能找到这样的东西,但一定不知道正确的搜索词。感谢您的任何帮助。

数据帧示例(底部的较长版本)具有日期时间列和瓶子列。瓶子列从某个数字(下面的 1)开始,并在添加样品时重复并切换到 2,依此类推,直到瓶子 7(在本例中),然后从 1 重新开始并转到 14(在本例中)并一遍又一遍。(注意,真实文件中每瓶有2次以上)

datetime        bottle
6/9/2016 0:00   1
6/9/2016 0:15   1
6/9/2016 0:30   1
6/9/2016 0:45   1
6/9/2016 1:00   2
6/9/2016 1:15   2
6/9/2016 1:30   2
6/9/2016 1:45   3
6/9/2016 2:00   3
6/9/2016 2:15   4
6/9/2016 2:30   4
6/9/2016 2:45   5
6/9/2016 3:00   5
6/9/2016 3:15   6
6/9/2016 3:30   6
6/9/2016 3:45   7
6/9/2016 4:00   7
6/9/2016 4:15   7
6/9/2016 4:30   1
6/9/2016 4:45   1
6/9/2016 5:00   1
6/9/2016 5:15   2
6/9/2016 5:30   2
6/9/2016 5:45   2
6/9/2016 6:00   3
6/9/2016 6:15   3
6/9/2016 6:30   3

我需要创建一个包含瓶子开始和结束时间的新数据帧。请注意,每个瓶子序列都是重复的。

bottle begin         end
1   6/9/2016 0:00   6/9/2016 0:45
2   6/9/2016 1:00   6/9/2016 1:30
3   6/9/2016 1:45   6/9/2016 2:00
4   6/9/2016 2:15   6/9/2016 2:30
5   6/9/2016 2:45   6/9/2016 3:00
6   6/9/2016 3:15   6/9/2016 3:30
7   6/9/2016 3:45   6/9/2016 4:15
1   6/9/2016 4:30   6/9/2016 5:00
2   6/9/2016 5:15   6/9/2016 5:45
3   6/9/2016 6:00   6/9/2016 6:30

到目前为止,我所做的是下面的注释代码。这运行良好,但在整个数据帧上需要很长时间。

#create id number for each bottle using data.table
setDT(t2s_bottle_timing.df)[, id := .GRP, by = t2s_bottle]
#declare/set variables
x1 <- 1
x2 <- 1
x3 <- 1
i <- 1
N <- length(t2s_bottle_timing.df$t2s_bottle)
#renumber id column to have unique id for each bottle run
for (i in 2:(N-1)) {
x1 <- t2s_bottle_timing.df[(i) , 2] #load bottle numbers
x2 <- t2s_bottle_timing.df[(i+1) , 2] #load bottle numbers
if (x2 == x1)  {   t2s_bottle_timing.df[(i),3] <- x3 } #set id number
if (x2 != x1)  {   x3 <- x3 +1} #increment id number
t2s_bottle_timing.df[(i+1),3] <- x3 #load new id number into table
}
# get rid of unused stuff
rm(x1, x2, i, N, x3)
# summerise the raw dataframe to produce the bottle, begin, end dataframe
t2s_timing_output.df <- t2s_bottle_timing.df %>% group_by( id ,t2s_bottle ) 
%>%  #group_by(id,bottle)
summarize(
begin = min(datetime),
end = max(datetime) )

所以这行得通,但我渴望学习另一种方法和更有效的方法。

t2s_bottle_timing.df <- structure(list(datetime = structure(c(1465514100, 1465515000, 
1465515900, 1465516800, 1465517700, 1465518600, 1465519500, 1465520400, 
1465521300, 1465522200, 1465523100, 1465524000, 1465524900, 1465525800, 
1465526700, 1465527600, 1465528500, 1465529400, 1465530300, 1465531200, 
1465532100, 1465533000, 1465533900, 1465534800, 1465535700, 1465536600, 
1465537500, 1465538400, 1465539300, 1465540200, 1465541100, 1465542000, 
1465542900, 1465543800, 1465544700, 1465545600, 1465546500, 1465547400, 
1465548300, 1465549200, 1465550100, 1465551000, 1465551900, 1465552800, 
1465553700, 1465554600, 1465555500, 1465556400, 1465557300, 1465558200, 
1465559100, 1465560000, 1465560900, 1465561800, 1465562700, 1465563600, 
1465564500, 1465565400, 1465566300, 1465567200, 1465568100, 1465569000, 
1465569900, 1465570800, 1465571700, 1465572600, 1465573500, 1465574400, 
1465575300, 1465576200, 1465577100, 1465578000, 1465578900, 1465579800, 
1465580700, 1465581600, 1465582500, 1465583400, 1465584300, 1465585200, 
1465586100, 1465587000, 1465587900, 1465588800, 1465589700, 1465590600, 
1465591500), tzone = "UTC", class = c("POSIXct", "POSIXt")), 
t2s_bottle = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 
7L, 7L, 7L, 7L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 
6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
8L)), .Names = c("datetime", "t2s_bottle"), row.names = c(NA, 
-87L), spec = structure(list(cols = structure(list(datetime = structure(list(), class = c("collector_character", 
"collector")), t2s_bottle = structure(list(), class = c("collector_integer", 
"collector"))), .Names = c("datetime", "t2s_bottle")), default = structure(list(), class = c("collector_guess", 
"collector"))), .Names = c("cols", "default"), class = "col_spec"), class = c("tbl_df", 
"tbl", "data.frame"))

你的例子让我有点困惑,但如果你想要创建一个索引,也许cumsum逻辑可能会有所帮助:

t2s_bottle_timing.df %>% 
mutate(index = cumsum(t2s_bottle != dplyr::lag(t2s_bottle, default = 0))) %>% 
group_by(index, t2s_bottle) %>% 
summarise(begin = min(datetime), end = max(datetime))
index t2s_bottle               begin                 end
<int>      <int>              <dttm>              <dttm>
1      1          1 2016-06-09 23:15:00 2016-06-10 00:15:00
2      2          2 2016-06-10 00:30:00 2016-06-10 02:15:00
3      3          3 2016-06-10 02:30:00 2016-06-10 04:30:00
4      4          4 2016-06-10 04:45:00 2016-06-10 06:00:00
5      5          5 2016-06-10 06:15:00 2016-06-10 07:45:00
6      6          6 2016-06-10 08:00:00 2016-06-10 09:00:00
7      7          7 2016-06-10 09:15:00 2016-06-10 10:15:00
8      8          1 2016-06-10 10:30:00 2016-06-10 11:15:00
9      9          2 2016-06-10 11:30:00 2016-06-10 13:00:00
10    10          3 2016-06-10 13:15:00 2016-06-10 13:30:00
11    11          4 2016-06-10 13:45:00 2016-06-10 15:00:00
12    12          5 2016-06-10 15:15:00 2016-06-10 15:45:00
13    13          6 2016-06-10 16:00:00 2016-06-10 17:15:00
14    14          7 2016-06-10 17:30:00 2016-06-10 18:45:00
15    15          8 2016-06-10 19:00:00 2016-06-10 20:45:00

最新更新