我有一个看起来像这样的数据帧:
df <-
data.frame(
a_1995 = 1:4,
b_1995 = 11:14,
a_1996 = 21:24,
a_1997 = 1:4,
b_1997 = 51:54,
a_1998 = 31:34,
a_1999 = 21:24)
因此,几年来,我采取了多种措施。我想创建一组新的列,这些列是当年进行的1或2次测量的平均值。我可以按照以下方式手动完成此操作以获得所需的输出:
out <-
df %>%
mutate(
avg_1995 = rowMeans(select(., contains("1995"))),
avg_1996 = rowMeans(select(., contains("1996"))),
avg_1997 = rowMeans(select(., contains("1997"))),
avg_1998 = rowMeans(select(., contains("1998"))),
avg_1999 = rowMeans(select(., contains("1999"))))
有没有一种方法可以使用purrr
或dplyr
函数实现自动化?(我有数百个这样的专栏。(
一个选项可以是:
map_dfc(.x = as.character(1995:1999),
~ df %>%
transmute(!!paste("ave", .x, sep = "_") := rowMeans(select(., contains(.x)))))
ave_1995 ave_1996 ave_1997 ave_1998 ave_1999
1 6 21 26 31 21
2 7 22 27 32 22
3 8 23 28 33 23
4 9 24 29 34 24
这里是另一个使用aggregate
的基本R解决方案
u<-aggregate(.~year,data.frame(year = gsub("\D+","avg_",names(df)),t(df)),mean)
dfout <- setNames(data.frame(t(u[-1]),row.names = NULL),u$year)
使得
> dfout
avg_1995 avg_1996 avg_1997 avg_1998 avg_1999
1 6 21 26 31 21
2 7 22 27 32 22
3 8 23 28 33 23
4 9 24 29 34 24
基本R
d = data.frame(lapply(split.default(df, gsub("\D+", "", names(df))), rowMeans), check.names = FALSE)
names(d) = paste0("avg_", names(d))
cbind(df, d)
tidyverse
library(dplyr)
library(tidyr)
df %>%
mutate(rn = row_number()) %>%
left_join(df %>%
mutate(rn = row_number()) %>%
gather(key, val, -rn) %>%
mutate(year = paste0("avg_", gsub("\D+", "", key))) %>%
group_by(rn, year) %>%
summarise(val = mean(val)) %>%
spread(year, val),
by = "rn") %>%
select(-rn)
下面是一个使用tidyr
和dplyr
:的解决方案
df <-
data.frame(
a_1995 = 1:4,
b_1995 = 11:14,
a_1996 = 21:24,
a_1997 = 1:4,
b_1997 = 51:54,
a_1998 = 31:34,
a_1999 = 21:24)
suppressPackageStartupMessages( library(dplyr) )
suppressPackageStartupMessages( library(tidyr) )
df %>%
pivot_longer(data = ., cols = names(.),
names_to = "type_year"
) %>%
separate(col = "type_year", into = c("type", "year"), sep = "_") %>%
group_by(year) %>%
summarise(mean_value = mean(value)) %>%
pivot_wider(names_from = year, values_from = mean_value) %>%
rename_all(~paste0("avg_", .))
#> # A tibble: 1 x 5
#> avg_1995 avg_1996 avg_1997 avg_1998 avg_1999
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 7.5 22.5 27.5 32.5 22.5
附加解决方案
result <- df %>%
mutate(n = row_number()) %>%
pivot_longer(-n) %>%
tidyr::extract(name, "year", "(\d{4})") %>%
group_by(n, year) %>%
summarise(value = mean(value, na.rm = T)) %>%
pivot_wider(n, names_from = year, values_from = value, names_prefix = "avg_") %>%
ungroup() %>%
select(-n) %>%
bind_cols(df, .)