我有一个包含 4 个变量的数据框:day(Date,格式:"YYYY-MM-DD")、hour(POSIXct,格式:"YYYY-MM-DD hh:mm:ss")、department(chr)和 amount(numeric)。
# Example data: two departments, each observed every 15 minutes (nine
# observations) on 2019-08-08. `max_cond` holds the expected answer: the
# maximum `amount` at or after each row's `hour` within its day/department.
df <- structure(
  list(
    day = rep(as.Date("2019-08-08"), 18L),
    # Epoch seconds in 900 s (15 min) steps, repeated once per department;
    # an empty `tzone` means the times print in the session's local zone.
    hour = structure(
      rep(seq(1565275500, 1565282700, by = 900), times = 2L),
      class = c("POSIXct", "POSIXt"),
      tzone = ""
    ),
    department = rep(c("DPT1", "DPT2"), each = 9L),
    amount = c(2, 3, 3, 2, 0, 0, 1, 2, 1, 3, 3, 3, 2, 2, 3, 0, 0, 0),
    max_cond = c(3, 3, 3, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 3, 0, 0, 0)
  ),
  row.names = c(NA, -18L),
  class = "data.frame"
)
对于 data.frame 的每一行,我想得到按 day 和 department 分组后 amount 的最大值,但只考虑同一组中 hour 大于或等于该行 hour 的那些行。
换句话说,对于每个观测 [day_i, hour_i, department_i],我想要得到:max(amount | (day == day_i) & (department == department_i) & (hour >= hour_i))。
对于上面的例子,我们应该有:
day hour department amount max_cond
1 2019-08-08 2019-08-08 11:45:00 DPT1 2 3
2 2019-08-08 2019-08-08 12:00:00 DPT1 3 3
3 2019-08-08 2019-08-08 12:15:00 DPT1 3 3
4 2019-08-08 2019-08-08 12:30:00 DPT1 2 2
5 2019-08-08 2019-08-08 12:45:00 DPT1 0 2
6 2019-08-08 2019-08-08 13:00:00 DPT1 0 2
7 2019-08-08 2019-08-08 13:15:00 DPT1 1 2
8 2019-08-08 2019-08-08 13:30:00 DPT1 2 2
9 2019-08-08 2019-08-08 13:45:00 DPT1 1 1
10 2019-08-08 2019-08-08 11:45:00 DPT2 3 3
11 2019-08-08 2019-08-08 12:00:00 DPT2 3 3
12 2019-08-08 2019-08-08 12:15:00 DPT2 3 3
13 2019-08-08 2019-08-08 12:30:00 DPT2 2 3
14 2019-08-08 2019-08-08 12:45:00 DPT2 2 3
15 2019-08-08 2019-08-08 13:00:00 DPT2 3 3
16 2019-08-08 2019-08-08 13:15:00 DPT2 0 0
17 2019-08-08 2019-08-08 13:30:00 DPT2 0 0
18 2019-08-08 2019-08-08 13:45:00 DPT2 0 0
思路非常相似,但使用 data.table。
你可以这样做:
library(data.table)

# Example data: two departments, each observed every 15 minutes (nine
# observations) on 2019-08-08. `max_cond` is the expected answer and is kept
# so the computed column can be compared against it.
df <- structure(
  list(
    day = rep(as.Date("2019-08-08"), 18L),
    hour = structure(
      rep(seq(1565275500, 1565282700, by = 900), times = 2L),
      class = c("POSIXct", "POSIXt"),
      tzone = ""
    ),
    department = rep(c("DPT1", "DPT2"), each = 9L),
    amount = c(2, 3, 3, 2, 0, 0, 1, 2, 1, 3, 3, 3, 2, 2, 3, 0, 0, 0),
    max_cond = c(3, 3, 3, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 3, 0, 0, 0)
  ),
  row.names = c(NA, -18L),
  class = "data.frame"
)

dt <- as.data.table(df)

# Within each (day, department) group, visit the rows from the latest hour to
# the earliest and take the running maximum of `amount`; each row therefore
# receives the maximum over all rows at or after its own hour.
# Ordering inside `i` leaves the table's physical row order untouched, so no
# re-sorting with a hard-coded key is needed afterwards.
# Assumes `hour` is unique within a group: with duplicated hours, tied rows
# can receive different values.
dt[
  order(hour, decreasing = TRUE),
  max_cond_new := cummax(amount),
  by = .(day, department)
]
希望这有帮助!
一种 base R 方法:您可以使用 cummax()(cumulative maximum,累积最大值)来解决这个问题。请注意,我假设您的数据框已按 hour 排序,您的示例中就是这种情况。
其思路是:首先用 split() 将数据框拆分为具有不同 day 和 department 的组件。然后,在每个组件中:
- 反转相关向量 $amount(即按 hour 从晚到早排列),
- 使用 cummax() 构造 $max_cond 变量(此时为反向顺序),
- 将 $max_cond 变量翻转回正确的顺序。
然后,用 do.call() 和 rbind() 将所有组件重新拼接在一起。
对于您的示例:
# Split into one data frame per (department, day) combination.
df2 <- split(df, list(df$department, df$day))

# For each group, compute for every row the maximum `amount` among the rows
# whose `hour` is at or after that row's `hour`.
df2 <- lapply(df2, function(x) {
  # Row positions from the latest hour to the earliest.
  latest_first <- order(x$hour, decreasing = TRUE)
  # Start from `amount` so the result keeps its storage type, then write the
  # running maximum back through the same index. Each value lands on the row
  # it belongs to, whether or not the group was sorted by hour beforehand.
  # Assumes `hour` is unique within a group: with duplicated hours, tied rows
  # can receive different values.
  running_max <- x$amount
  running_max[latest_first] <- cummax(x$amount[latest_first])
  x$max_cond <- running_max
  x
})

# Stack the groups back into one data frame. Rows come out grouped by
# (department, day), and the split-derived row names are discarded.
df2 <- do.call(rbind, df2)
row.names(df2) <- NULL
df2
## day hour department amount max_cond
## 1 2019-08-08 2019-08-08 10:45:00 DPT1 2 3
## 2 2019-08-08 2019-08-08 11:00:00 DPT1 3 3
## 3 2019-08-08 2019-08-08 11:15:00 DPT1 3 3
## 4 2019-08-08 2019-08-08 11:30:00 DPT1 2 2
## 5 2019-08-08 2019-08-08 11:45:00 DPT1 0 2
## 6 2019-08-08 2019-08-08 12:00:00 DPT1 0 2
## 7 2019-08-08 2019-08-08 12:15:00 DPT1 1 2
## 8 2019-08-08 2019-08-08 12:30:00 DPT1 2 2
## 9 2019-08-08 2019-08-08 12:45:00 DPT1 1 1
## 10 2019-08-08 2019-08-08 10:45:00 DPT2 3 3
## 11 2019-08-08 2019-08-08 11:00:00 DPT2 3 3
## 12 2019-08-08 2019-08-08 11:15:00 DPT2 3 3
## 13 2019-08-08 2019-08-08 11:30:00 DPT2 2 3
## 14 2019-08-08 2019-08-08 11:45:00 DPT2 2 3
## 15 2019-08-08 2019-08-08 12:00:00 DPT2 3 3
## 16 2019-08-08 2019-08-08 12:15:00 DPT2 0 0
## 17 2019-08-08 2019-08-08 12:30:00 DPT2 0 0
## 18 2019-08-08 2019-08-08 12:45:00 DPT2 0 0