在R中,我有一些生态数据,格式如下:
# Toy ecological data set: 20 numbered samples, each assigned to one of the
# taxonomic groups A-F (3 x A, 2 x B, 1 x C, 1 x D, 10 x E, 3 x F).
group <- rep(c("A", "B", "C", "D", "E", "F"), times = c(3, 2, 1, 1, 10, 3))
# Sample identifiers 1, 2, ..., 20 (kept as doubles).
sample <- seq(from = 1, to = length(group), by = 1)
# One row per sample: its identifier and the group it belongs to.
df <- data.frame(sample = sample, group = group)
其中 sample 是样本编号,group 是每个样本所属的分类组。
总共我有20个样本(实际上更多),可以用下面的代码得到各组的相对频率:
data.frame(table(group)/length(group))
group Freq
1 A 0.15
2 B 0.10
3 C 0.05
4 D 0.05
5 E 0.50
6 F 0.15
现在我想对我的数据框进行100次子采样(每次抽取10个样本),并得到每组的平均相对频率和标准差。
我该怎么做?
您可以使用以下代码
# Draw 100 random subsamples of 10 rows from `df` and tabulate the group
# composition of each subsample.
#
# Result: a matrix with one row per group and one column per subsample; each
# column holds relative frequencies (normalised column-wise by proportions()).
data <- with(
  df,
  proportions(
    replicate(
      100,
      table(
        # `df` defines its columns as lower-case `group` and `sample`; R is
        # case-sensitive, so they must be referenced with exactly that
        # spelling. Rows are picked by position (seq_along) so the draw does
        # not rely on the sample IDs happening to equal the row numbers.
        # Fixing the factor levels keeps groups that are absent from a
        # subsample as explicit zero counts.
        factor(group[sample(seq_along(group), 10)], levels = unique(group))
      )
    ),
    2 # margin 2: normalise every column (subsample) separately
  )
)
以获得
> data
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
A 0.2 0.3 0.2 0.1 0.1 0.1 0.3 0.2 0.1 0.2 0.1 0.2 0.2 0.2
B 0.1 0.2 0.0 0.2 0.2 0.1 0.1 0.0 0.2 0.1 0.1 0.1 0.1 0.0
C 0.1 0.0 0.0 0.1 0.1 0.1 0.0 0.1 0.1 0.1 0.0 0.1 0.0 0.1
D 0.0 0.0 0.1 0.0 0.1 0.1 0.0 0.1 0.0 0.0 0.0 0.1 0.0 0.1
E 0.4 0.5 0.5 0.4 0.5 0.6 0.6 0.5 0.3 0.5 0.7 0.4 0.7 0.5
F 0.2 0.0 0.2 0.2 0.0 0.0 0.0 0.1 0.3 0.1 0.1 0.1 0.0 0.1
[,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
A 0.1 0.0 0.1 0.2 0.1 0.2 0.1 0.3 0.2 0.3 0.1 0.1
B 0.1 0.1 0.1 0.1 0.1 0.1 0.0 0.1 0.1 0.1 0.2 0.1
C 0.1 0.0 0.1 0.1 0.1 0.0 0.0 0.0 0.0 0.0 0.0 0.1
D 0.1 0.0 0.0 0.1 0.0 0.1 0.1 0.1 0.1 0.0 0.1 0.1
E 0.5 0.6 0.6 0.3 0.5 0.5 0.6 0.4 0.5 0.4 0.4 0.5
F 0.1 0.3 0.1 0.2 0.2 0.1 0.2 0.1 0.1 0.2 0.2 0.1
[,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37] [,38]
A 0.1 0.2 0.2 0.1 0.2 0.2 0.0 0.3 0.2 0.1 0.0 0.2
B 0.1 0.1 0.2 0.1 0.1 0.0 0.1 0.2 0.1 0.1 0.1 0.0
C 0.0 0.0 0.0 0.1 0.0 0.1 0.1 0.1 0.1 0.0 0.1 0.0
D 0.1 0.1 0.0 0.1 0.1 0.1 0.1 0.1 0.0 0.0 0.1 0.1
E 0.5 0.3 0.4 0.5 0.4 0.5 0.6 0.2 0.4 0.5 0.4 0.6
F 0.2 0.3 0.2 0.1 0.2 0.1 0.1 0.1 0.2 0.3 0.3 0.1
[,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [,47] [,48] [,49] [,50]
A 0.1 0.2 0.1 0.2 0.2 0.1 0.1 0.1 0.2 0.2 0.2 0.1
B 0.0 0.1 0.0 0.1 0.1 0.2 0.0 0.0 0.2 0.1 0.1 0.1
C 0.0 0.1 0.1 0.1 0.0 0.1 0.0 0.0 0.1 0.1 0.0 0.0
D 0.0 0.1 0.0 0.0 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1
E 0.7 0.4 0.5 0.4 0.5 0.5 0.7 0.8 0.2 0.4 0.4 0.6
F 0.2 0.1 0.3 0.2 0.1 0.0 0.1 0.0 0.2 0.1 0.2 0.1
[,51] [,52] [,53] [,54] [,55] [,56] [,57] [,58] [,59] [,60] [,61] [,62]
A 0.1 0.0 0.0 0.2 0.3 0.0 0.2 0.2 0.2 0.1 0.1 0.2
B 0.1 0.2 0.2 0.1 0.0 0.2 0.0 0.1 0.2 0.2 0.2 0.1
C 0.0 0.0 0.1 0.0 0.1 0.1 0.0 0.0 0.0 0.0 0.1 0.0
D 0.1 0.1 0.0 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.0
E 0.5 0.5 0.6 0.5 0.3 0.4 0.4 0.5 0.4 0.5 0.4 0.5
F 0.2 0.2 0.1 0.1 0.2 0.2 0.3 0.1 0.1 0.1 0.1 0.2
[,63] [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72] [,73] [,74]
A 0.2 0.3 0.2 0.1 0.2 0.1 0.2 0.3 0.3 0.1 0.2 0.2
B 0.0 0.0 0.1 0.2 0.1 0.1 0.1 0.1 0.0 0.0 0.0 0.1
C 0.0 0.0 0.1 0.0 0.0 0.1 0.0 0.0 0.0 0.1 0.1 0.1
D 0.0 0.1 0.0 0.0 0.1 0.1 0.1 0.0 0.1 0.0 0.0 0.1
E 0.6 0.4 0.6 0.5 0.4 0.4 0.5 0.6 0.5 0.6 0.6 0.4
F 0.2 0.2 0.0 0.2 0.2 0.2 0.1 0.0 0.1 0.2 0.1 0.1
[,75] [,76] [,77] [,78] [,79] [,80] [,81] [,82] [,83] [,84] [,85] [,86]
A 0.1 0.2 0.1 0.2 0.2 0.0 0.2 0.1 0.2 0.0 0.1 0.1
B 0.1 0.1 0.2 0.2 0.2 0.1 0.1 0.1 0.2 0.2 0.1 0.1
C 0.1 0.0 0.1 0.0 0.0 0.1 0.1 0.0 0.1 0.1 0.1 0.0
D 0.1 0.1 0.0 0.0 0.1 0.1 0.1 0.1 0.0 0.1 0.0 0.0
E 0.4 0.5 0.6 0.6 0.4 0.5 0.4 0.4 0.5 0.4 0.4 0.6
F 0.2 0.1 0.0 0.0 0.1 0.2 0.1 0.3 0.0 0.2 0.3 0.2
[,87] [,88] [,89] [,90] [,91] [,92] [,93] [,94] [,95] [,96] [,97] [,98]
A 0.2 0.3 0.2 0.1 0.1 0.2 0.2 0.1 0.2 0.1 0.3 0.3
B 0.0 0.0 0.1 0.2 0.0 0.2 0.1 0.1 0.0 0.0 0.0 0.1
C 0.1 0.1 0.0 0.1 0.0 0.1 0.1 0.1 0.0 0.1 0.0 0.0
D 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.1 0.1 0.0
E 0.7 0.5 0.6 0.5 0.7 0.5 0.5 0.6 0.7 0.6 0.4 0.4
F 0.0 0.1 0.1 0.1 0.2 0.0 0.1 0.1 0.1 0.1 0.2 0.2
[,99] [,100]
A 0.1 0.2
B 0.1 0.2
C 0.0 0.0
D 0.0 0.1
E 0.7 0.4
F 0.1 0.1
基于上面得到的 data,可以按如下方式计算各组的 mean 和 sd:
按组计算 mean
> rowMeans(data)
A B C D E F
0.167 0.096 0.051 0.056 0.492 0.138
按组计算 sd
> apply(data, 1, sd)
A B C D E F
0.08577631 0.07035265 0.05000000 0.05016136 0.12583057 0.07869517
可以看看 tidyverse(purrr 和 dplyr),它提供了一种函数式的做法。不确定你想怎样处理标准差:
library(tidyverse)

# Number of repeated subsamples to draw.
times <- 100
# Rows per subsample. The task asks for subsamples of 10 rows; this must not
# exceed nrow(df) because rows are drawn without replacement.
subpopulation <- 10

#' Relative group frequencies of one random subsample
#'
#' @param time Identifier of the current repetition; stored in the
#'   `experiment` column of the result.
#' @param df_in Data frame with a `group` column to subsample from.
#' @param subpop Number of rows to draw (without replacement).
#' @return A tibble with one row per group occurring in `df_in` and the
#'   columns `group`, `mean_freq` (share of the subsample belonging to that
#'   group, 0 when the group was not drawn) and `experiment`.
sample_summary <- function(time, df_in = df, subpop = subpopulation) {
  # seq_len() instead of 1:nrow() so an empty data frame cannot yield c(1, 0).
  picked <- sample(seq_len(nrow(df_in)), size = subpop, replace = FALSE)

  df_in[picked, ] %>%
    group_by(group) %>%
    summarize(mean_freq = n() / subpop) %>%
    # Groups missing from this subsample get an explicit frequency of 0;
    # dropping them would bias any per-group mean / sd computed across
    # repetitions.
    complete(group = unique(df_in$group), fill = list(mean_freq = 0)) %>%
    mutate(experiment = time)
}

# Run all repetitions and stack the per-repetition summaries row-wise.
seq_len(times) %>%
  map(sample_summary) %>%
  bind_rows()
不太确定你的具体需求,不过用 base R 可以这样做:思路是先在一个列表中生成100个子样本,用 lapply() 对每个元素计算相对频率,最后合并成一个 data.frame(),再聚合并计算 mean() 和 sd()。
# Base-R version: draw 100 subsamples of 10 rows each, compute the relative
# frequency of every group within each subsample, then report the mean and
# standard deviation of those frequencies per group.

# Reference table holding every group that occurs in the full data set.
unique_groups <- data.frame(group = unique(df$group))

# Fixed seed so the random draws are reproducible.
set.seed(1234)

# One data frame of 10 randomly chosen rows (without replacement) per draw.
# lapply() builds the list in one go instead of growing it inside a for loop.
listed <- lapply(seq_len(100), function(i) df[sample(nrow(df), 10), ])

# Relative frequency of each group in every subsample.
# The factor levels are fixed to the full set of groups, so a group that was
# not drawn is reported with frequency 0. Padding the subsample with a
# placeholder row per absent group (merge(..., all.x = TRUE) and NA -> 0)
# would instead count that group once and inflate the denominator.
listed_freq <- lapply(listed, function(x) {
  counts <- table(factor(x$group, levels = unique_groups$group))
  data.frame(counts / nrow(x))
})

# Stack the per-subsample tables (columns Var1 = group, Freq = frequency).
df_freq <- do.call(rbind, listed_freq)

# Mean and standard deviation of the relative frequency for each group.
aggregate(
  . ~ Var1,
  data = df_freq,
  FUN = function(x) c(mn = mean(x), stdev = sd(x))
)
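结果(注意:下面的输出中 C 和 D 的均值恰为 0.10、标准差为 0,且各组均值之和大于 1,说明未被抽中的组没有被计为频率 0,这些均值因此被高估;正确的均值应接近总体相对频率,例如 C、D 约为 0.05):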
Var1 Freq.mn Freq.stdev
1 A 0.17333333 0.05958659
2 B 0.15362319 0.05023389
3 C 0.10000000 0.00000000
4 D 0.10000000 0.00000000
5 E 0.47300000 0.11621558
6 F 0.16813187 0.06972824