如何在 R 中获得前两年的最大值



嗨,我有数据框

如何使用过去 2 年的最大值创建列max_value

dt <- 
structure(list(Name = c("A", "A", "A", "A", "A", "A", "A", "B", 
"B", "B", "B", "B", "B"), year = c(2012L, 2012L, 2013L, 2014L, 
2015L, 2016L, 2017L, 2012L, 2013L, 2013L, 2014L, 2015L, 2016L
), value = c(22L, 99L, 12L, 1L, 23L, 40L, 12L, 12L, 33L, 40L, 
NA, 20L, 20L), max_value = c(NA, NA, 99L, 99L, 12L, 23L, 40L, 
NA, 12L, 12L, 40L, 40L, 20L)), .Names = c("Name", "year", "value", 
"max_value"), row.names = c(NA, -13L), class = c("data.table", 
"data.frame"))
Name   year   value   *max_value*
A      2012    22        NA
A      2012    99        NA
A      2013    12        99
A      2014    01        99
A      2015    23        12
A      2016    40        23
A      2017    12        40
B      2012    12        NA
B      2013    33        12
B      2013    40        12
B      2014    NA        40
B      2015    20        40
B      2016    20        20

提前致谢

这是另一种 data.table 方法,它使用自连接 by.EACHI

library(data.table)
# temporary rowwise id
setDT(dt)[, id := .I]
# set key
setkey(dt, id)
# self join, set infinite values back to NA
dt[dt, max_val2 := {
dt[Name == i.Name & year >= (i.year - 2) & year < i.year, max(value, na.rm = TRUE)]
}, by = .EACHI][is.infinite(max_val2), max_val2 := NA][, id := NULL]
Name year value max_value max_val2
1:    A 2012    22        NA       NA
2:    A 2012    99        NA       NA
3:    A 2013    12        99       99
4:    A 2014     1        99       99
5:    A 2015    23        12       12
6:    A 2016    40        23       23
7:    A 2017    12        40       40
8:    B 2012    12        NA       NA
9:    B 2013    33        12       12
10:    B 2013    40        12       12
11:    B 2014    NA        40       40
12:    B 2015    20        40       NA
13:    B 2016    20        20       NA

这里的复杂之处在于年份的条目数量不同。这可以通过分组来解决,然后汇总以获得一年内的最大值。接下来使用slide_int()获取前两年的最大值,最终将此年度数据连接回原始帧:

library(tidyverse)
library(slider)
sol <- dt %>%
group_by(Name, year) %>%
summarize(max_val = max(value, na.rm = T)) %>% 
mutate(slide_max = slide_int(lag(max_val), max, 
.before = 1, na.rm = TRUE, .complete = T)) %>%
select(-max_val) %>%
right_join(dt, by = c("Name", "year"))
sol
#> # A tibble: 13 × 5
#> # Groups:   Name [2]
#>    Name   year slide_max value max_value
#>    <chr> <int>     <int> <int>     <int>
#>  1 A      2012        NA    22        NA
#>  2 A      2012        NA    99        NA
#>  3 A      2013        99    12        99
#>  4 A      2014        99     1        99
#>  5 A      2015        12    23        12
#>  6 A      2016        23    40        23
#>  7 A      2017        40    12        40
#>  8 B      2012        NA    12        NA
#>  9 B      2013        12    33        12
#> 10 B      2013        12    40        12
#> 11 B      2014        40    NA        40
#> 12 B      2015        40    20        40
#> 13 B      2016        20    20        20
identical(sol$slide_max, sol$max_value)
#> [1] TRUE

这是一个使用聚合、二维shiftapply和连接的方法。

library(data.table)
dt[dt[, .(mx=max(value)), by=c("Name", "year")
][, .(year,
max_val=apply(matrix(unlist(shift(mx, 1:2)), ncol=2), 1, max, na.rm=TRUE)),
by=Name],
on=c("Name", "year")][is.infinite(max_val), max_val := NA][]

第一行按年份和名称计算最大值。返回第二行,对于每个名称、年份和 usingapply两个滞后年份(使用shift(mx, 1:2))中的最大值,删除 NA 值。这会导致具有 2 个 NA 值的每一行发出警告,并在该位置返回 -Inf。我不得不手动将shift的输出转换为矩阵,以便将其提供给应用程序,这并不理想。生成的 data.table 使用名称和年份作为 ID 连接到原始表。最后,-Inf 值在最后一行中替换为 NA,并用[]打印结果。

这返回

Name year value max_value max_val
1:    A 2012    22        NA      NA
2:    A 2012    99        NA      NA
3:    A 2013    12        99      99
4:    A 2014     1        99      99
5:    A 2015    23        12      12
6:    A 2016    40        23      23
7:    A 2017    12        40      40
8:    B 2012    12        NA      NA
9:    B 2013    33        12      12
10:    B 2013    40        12      12
11:    B 2014    NA        40      40
12:    B 2015    20        40      40
13:    B 2016    20        20      20

数据

dt <- 
structure(list(Name = c("A", "A", "A", "A", "A", "A", "A", "B", 
"B", "B", "B", "B", "B"), year = c(2012L, 2012L, 2013L, 2014L, 
2015L, 2016L, 2017L, 2012L, 2013L, 2013L, 2014L, 2015L, 2016L
), value = c(22L, 99L, 12L, 1L, 23L, 40L, 12L, 12L, 33L, 40L, 
NA, 20L, 20L), max_value = c(NA, NA, 99L, 99L, 12L, 23L, 40L, 
NA, 12L, 12L, 40L, 40L, 20L)), .Names = c("Name", "year", "value", 
"max_value"), row.names = c(NA, -13L), class = c("data.table", 
"data.frame"))

这是我到目前为止得到的。滑块包是整洁的,

library(slider)
dt %>% group_by(Name) %>% 
mutate(slide_max = slide_dbl(lag(value), max, .before = 2, na.rm = TRUE, .complete = T))
# A tibble: 13 x 5
# Groups:   Name [2]
Name   year value max_value slide_max
<chr> <int> <int>     <int>     <dbl>
1 A      2012    22        NA        NA
2 A      2012    99        NA        NA
3 A      2013    12        99        99
4 A      2014     1        99        99
5 A      2015    23        12        99
6 A      2016    40        23        23
7 A      2017    12        40        40
8 B      2012    12        NA        NA
9 B      2013    33        12        NA
10 B      2013    40        12        33
11 B      2014    NA        40        40
12 B      2015    20        40        40
13 B      2016    20        20        40

下面是使用 mapply 的基本 R 解决方案。

df <- data.frame(Name = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B"),
year = c(2012, 2012, 2013, 2014, 2015, 2016, 2017, 2012, 2013, 2013, 2014, 2015, 2016),
value = c(22, 99, 12, 1, 23, 40, 12, 12, 33, 40, NA, 20, 20),
stringsAsFactors = FALSE)
max.vals <- mapply(function(x, y){
vals <- df[df$year %in% c(x-2,x-1) & df$Name == y,"value"]
max.val <- ifelse(length(vals) > 0, max(vals, na.rm = TRUE), NA)
max.val <- list(y,x,max.val)
names(max.val) <- c("Name","year","max_value")
return(max.val)
},
unique(df[,c("Name","year")])$year,
unique(df[,c("Name","year")])$Name
) 
max.vals <- as.data.frame(t(max.vals),stringsAsFactors = FALSE)
df <- merge(df, max.vals)

使用by

> by(dat$value, dat$year, function(x) max(x))
dat$year: 2012
[1] 99
------------------------------------------------------------ 
dat$year: 2013
[1] 40
------------------------------------------------------------ 
dat$year: 2014
[1] NA
------------------------------------------------------------ 
dat$year: 2015
[1] 23
------------------------------------------------------------ 
dat$year: 2016
[1] 40
------------------------------------------------------------ 
dat$year: 2017
[1] 12

编辑:一开始误解了这个问题。这应该是您想要的:

将结果分配给数据帧:

> dat1=by(dat$value, dat$year, function(x) max(x))
> data.frame("max"=dat1[1:length(dat1)])
max
2012  99
2013  40
2014  NA
2015  23
2016  40
2017  12

创建一个新的数据框来保存两年一次的最大值,并循环以比较年份:

bi_max=data.frame("max"=nrow(dat_max))
for(i in 1:nrow(dat_max)){
bi_max[i,]=max(dat_max$max[i], dat_max$max[i-1], na.rm=T)
}
rownames(bi_max)=rownames(dat_max)

最终结果:

> bi_max
max
2012  99
2013  99
2014  40
2015  23
2016  40
2017  40

最新更新