r-使用"dplyr"或"purrr"来获取共享字符串片段(例如年份)的多列的平



我有一个看起来像这样的数据帧:

df <- 
data.frame(
a_1995 = 1:4,
b_1995 = 11:14,
a_1996 = 21:24,
a_1997 = 1:4,
b_1997 = 51:54,
a_1998 = 31:34,
a_1999 = 21:24)

因此,几年来,我采取了多种措施。我想创建一组新的列,这些列是当年进行的1或2次测量的平均值。我可以按照以下方式手动完成此操作以获得所需的输出:

out <- 
df %>% 
mutate(
avg_1995 = rowMeans(select(., contains("1995"))),
avg_1996 = rowMeans(select(., contains("1996"))),
avg_1997 = rowMeans(select(., contains("1997"))),
avg_1998 = rowMeans(select(., contains("1998"))),
avg_1999 = rowMeans(select(., contains("1999"))))

有没有一种方法可以使用purrrdplyr函数实现自动化?(我有数百个这样的专栏。(

一个选项可以是:

map_dfc(.x = as.character(1995:1999), 
~ df %>%
transmute(!!paste("ave", .x, sep = "_") := rowMeans(select(., contains(.x)))))
ave_1995 ave_1996 ave_1997 ave_1998 ave_1999
1        6       21       26       31       21
2        7       22       27       32       22
3        8       23       28       33       23
4        9       24       29       34       24

这里是另一个使用aggregate的基本R解决方案

u<-aggregate(.~year,data.frame(year = gsub("\D+","avg_",names(df)),t(df)),mean)
dfout <- setNames(data.frame(t(u[-1]),row.names = NULL),u$year)

使得

> dfout
avg_1995 avg_1996 avg_1997 avg_1998 avg_1999
1        6       21       26       31       21
2        7       22       27       32       22
3        8       23       28       33       23
4        9       24       29       34       24

基本R

d = data.frame(lapply(split.default(df, gsub("\D+", "", names(df))), rowMeans), check.names = FALSE)
names(d) = paste0("avg_", names(d))
cbind(df, d)

tidyverse

library(dplyr)
library(tidyr)
df %>%
mutate(rn = row_number()) %>%
left_join(df %>%
mutate(rn = row_number()) %>%
gather(key, val, -rn) %>%
mutate(year = paste0("avg_", gsub("\D+", "", key))) %>%
group_by(rn, year) %>%
summarise(val = mean(val)) %>%
spread(year, val),
by = "rn") %>%
select(-rn)

下面是一个使用tidyrdplyr:的解决方案

df <- 
data.frame(
a_1995 = 1:4,
b_1995 = 11:14,
a_1996 = 21:24,
a_1997 = 1:4,
b_1997 = 51:54,
a_1998 = 31:34,
a_1999 = 21:24)
suppressPackageStartupMessages( library(dplyr) )
suppressPackageStartupMessages( library(tidyr) )
df %>% 
pivot_longer(data = ., cols = names(.),
names_to = "type_year"
) %>% 
separate(col = "type_year", into = c("type", "year"), sep = "_") %>% 
group_by(year) %>% 
summarise(mean_value = mean(value)) %>% 
pivot_wider(names_from = year, values_from = mean_value) %>% 
rename_all(~paste0("avg_", .))
#> # A tibble: 1 x 5
#>   avg_1995 avg_1996 avg_1997 avg_1998 avg_1999
#>      <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
#> 1      7.5     22.5     27.5     32.5     22.5

附加解决方案

result <- df %>% 
mutate(n = row_number()) %>% 
pivot_longer(-n) %>% 
tidyr::extract(name, "year", "(\d{4})") %>% 
group_by(n, year) %>% 
summarise(value = mean(value, na.rm = T)) %>% 
pivot_wider(n, names_from = year, values_from = value, names_prefix = "avg_") %>% 
ungroup() %>% 
select(-n) %>% 
bind_cols(df, .)

最新更新