如何(简单地)将函数应用于R中长度不同的多个子集



我需要将一个函数应用于一列中不同长度的数据子集,并生成一个新的数据帧,其中包括输出及其相关元数据。

如果不使用for循环,我怎么能做到这一点?tapply()似乎是一个很好的起点,但我很难理解语法。

例如,我有这样的东西:

block plot id species type response
    1    1  1      w     a      1.5
    1    1  2      w     a      1
    1    1  3      w     a      2
    1    1  4      w     a      1.5
    1    2  5      x     a      5
    1    2  6      x     a      6
    1    2  7      x     a      7
    1    3  8      y     b      10 
    1    3  9      y     b      11
    1    3 10      y     b      9
    1    4 11      z     b      1
    1    4 12      z     b      3
    1    4 13      z     b      2
    2    5 14      w     a      0.5
    2    5 15      w     a      1
    2    5 16      w     a      1.5
    2    6 17      x     a      3
    2    6 18      x     a      2
    2    6 19      x     a      4
    2    7 20      y     b      13 
    2    7 21      y     b      12
    2    7 22      y     b      14
    2    8 23      z     b      2
    2    8 24      z     b      3
    2    8 25      z     b      4
    2    8 26      z     b      2
    2    8 27      z     b      4

我想得到这样的结果:

block plot species type mean.response
    1    1       w    a           1.5
    1    2       x    a           6
    1    3       y    b           10 
    1    4       z    b           2
    2    5       w    a           1
    2    6       x    a           3
    2    7       y    b           13
    2    8       z    b           3

试试这个。您可以使用group_by()来设置分组变量,然后使用summarise()来计算期望的变量。这里的代码使用dplyr:

library(dplyr)

# Mean response for every block / plot / species / type combination.
# summarise() only peels off the last grouping level, so the result is still
# grouped by block, plot and species; pipe it into ungroup() if a plain
# tibble is wanted.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
newdf <- df %>%
  group_by(block, plot, species, type) %>%
  summarise(Mean = mean(response, na.rm = TRUE))

输出:

# A tibble: 8 x 5
# Groups:   block, plot, species [8]
  block  plot species type   Mean
  <int> <int> <chr>   <chr> <dbl>
1     1     1 w       a       1.5
2     1     2 x       a       6  
3     1     3 y       b      10  
4     1     4 z       b       2  
5     2     5 w       a       1  
6     2     6 x       a       3  
7     2     7 y       b      13  
8     2     8 z       b       3  

或者使用base R(-3用于在聚合时排除id变量):

# Base R
# `response ~ .` groups by every remaining column, so the id column
# (position 3) is dropped first to keep it out of the grouping.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
newdf <- aggregate(response ~ ., data = df[, -3], FUN = mean, na.rm = TRUE)

输出:

  block plot species type response
1     1    1       w    a      1.5
2     2    5       w    a      1.0
3     1    2       x    a      6.0
4     2    6       x    a      3.0
5     1    3       y    b     10.0
6     2    7       y    b     13.0
7     1    4       z    b      2.0
8     2    8       z    b      3.0

使用的一些数据:

# Data
# Example data set: 27 observations spread over 8 plots in 2 blocks.
df <- local({
  # Number of rows in each of the plots 1..8 (plots 1-4 belong to block 1,
  # plots 5-8 to block 2).
  n_per_plot <- c(4L, 3L, 3L, 3L, 3L, 3L, 3L, 5L)

  data.frame(
    block = rep(1:2, times = c(13L, 14L)),
    plot = rep(1:8, times = n_per_plot),
    id = 1:27,
    # Each plot holds a single species; the w, x, y, z cycle repeats per block.
    species = rep(rep(c("w", "x", "y", "z"), times = 2L), times = n_per_plot),
    # Species w and x are type "a"; y and z are type "b".
    type = rep(rep(c("a", "b"), each = 2L, times = 2L), times = n_per_plot),
    response = c(
      1.5, 1, 2, 1.5,
      5, 6, 7,
      10, 11, 9,
      1, 3, 2,
      0.5, 1, 1.5,
      3, 2, 4,
      13, 12, 14,
      2, 3, 4, 2, 4
    ),
    # Keep species/type as character vectors on every R version.
    stringsAsFactors = FALSE
  )
})

假设输入数据dd如文末“备注”中以可重现的形式给出,则可以使用以下任意一种方法:

# 1. aggregate.formula - base R
# A bare `response` on the left-hand side also works when the name of the
# output column does not matter.
aggregate(
  cbind(mean.response = response) ~ block + plot + species + type,
  data = dd,
  FUN = mean
)

# 2. aggregate.default - base R
# The name given inside list() becomes the name of the output column.
v <- c("block", "plot", "species", "type")
aggregate(x = list(mean.response = dd[["response"]]), by = dd[v], FUN = mean)

# 3. sqldf
# `group by 1, 2, 3, 4` refers to the first four columns of the select list.
library(sqldf)
sqldf("select block, plot, species, type, avg(response) as [mean.response]
  from dd group by 1, 2, 3, 4")

# 4. data.table
library(data.table)
v <- c("block", "plot", "species", "type")
as.data.table(dd)[, list(mean.response = mean(response)), by = v]

# 5. doBy - the last output column will be labelled response.mean
library(doBy)
summaryBy(response ~ block + plot + species + type, data = dd)

备注

可复制形式的输入:

# Reproducible input: 27 observations spread over 8 plots in 2 blocks.
dd <- local({
  # Number of rows in each of the plots 1..8 (plots 1-4 belong to block 1,
  # plots 5-8 to block 2).
  n_per_plot <- c(4L, 3L, 3L, 3L, 3L, 3L, 3L, 5L)

  data.frame(
    block = rep(1:2, times = c(13L, 14L)),
    plot = rep(1:8, times = n_per_plot),
    id = 1:27,
    # Each plot holds a single species; the w, x, y, z cycle repeats per block.
    species = rep(rep(c("w", "x", "y", "z"), times = 2L), times = n_per_plot),
    # Species w and x are type "a"; y and z are type "b".
    type = rep(rep(c("a", "b"), each = 2L, times = 2L), times = n_per_plot),
    response = c(
      1.5, 1, 2, 1.5,
      5, 6, 7,
      10, 11, 9,
      1, 3, 2,
      0.5, 1, 1.5,
      3, 2, 4,
      13, 12, 14,
      2, 3, 4, 2, 4
    ),
    # Keep species/type as character vectors on every R version.
    stringsAsFactors = FALSE
  )
})

相关内容

最新更新