生成分类数据的置信区间

  • 本文关键字:区间 数据 分类 r
  • 更新时间 :
  • 英文 :


我正试图为我的数据计算二项比例置信区间。这里我只使用虹膜数据集。

# break the length into different size
iris$Slen <- cut(iris$Sepal.Length, 
breaks = c(4,5,6,8))
#need to compute confidence interval for binomial proportion within each group and size bin
alpha <- as.numeric(0.05) 
#function to calculate conf.int.
CI <-function(n,r) {
f1=qf(1-alpha/2, 2*r, 2*(n-r+1), lower.tail=FALSE)
f2=qf(alpha/2, 2*(r+1), 2*(n-r), lower.tail=FALSE)
pl=(1+(n-r+1)/(r*f1))^(-1)
pu=(1+(n-r)/((r+1)*f2))^(-1)
}

首先计算每个大小箱中的物种数量,然后使用下面的代码计算相对比例:

library(dplyr)
iris_count <- iris %>% 
group_by(Slen, Species) %>% 
summarise(TotalParticle = n())%>%
mutate(RelAbund = TotalParticle/sum(TotalParticle))

现在我想计算iris_count数据的CI。n是每个bin中的总数,r是CI函数中的RelAbund*n。对于Slen (4,5) setosa,n= 28+3+1,r= 0.875*32.

我如何直接计算这个,而不是手动为每个情况?

iris$Slen <- cut(iris$Sepal.Length, 
breaks = c(4,5,6,8))
alpha <- as.numeric(0.05) 
CI <-function(n,r) {
f1=qf(1-alpha/2, 2*r, 2*(n-r+1), lower.tail=FALSE)
f2=qf(alpha/2, 2*(r+1), 2*(n-r), lower.tail=FALSE)
pl=(1+(n-r+1)/(r*f1))^(-1)
pu=(1+(n-r)/((r+1)*f2))^(-1)
c(pl, pu) # note how I changed your function here so it returns the output you're looking for
}
library(dplyr)
iris_count <- iris %>% 
group_by(Slen, Species) %>% 
summarise(TotalParticle = n())%>%
mutate(RelAbund = TotalParticle/sum(TotalParticle))
totals <- aggregate(iris_count$TotalParticle, list(iris_count$Slen), sum)
colnames(totals)[which(colnames(totals) == 'Group.1')] <- 'Slen'
new_df <- as.data.frame(merge(iris_count, totals, 'Slen'))
new_df$x <- as.numeric(new_df$x)
colnames(new_df)
confint <- list(NULL)
for (i in seq_len(nrow(new_df))) {
confint[[i]] <- CI(new_df[i, which(colnames(new_df) == "x")], new_df[i, which(colnames(new_df) == "RelAbund")] * new_df[i, which(colnames(new_df) == "x")])
}
new_df$lower_CI <- sapply(confint, function (x) {
x[1]
})
new_df$upper_CI <- sapply(confint, function (x) {
x[2]
})
new_df
#    Slen    Species TotalParticle  RelAbund  x     lower_CI  upper_CI
# 1 (4,5]     setosa            28 0.8750000 32 0.7100515798 0.9648693
# 2 (4,5] versicolor             3 0.0937500 32 0.0197671802 0.2502270
# 3 (4,5]  virginica             1 0.0312500 32 0.0007908686 0.1621710
# 4 (5,6]     setosa            22 0.3859649 57 0.2599546810 0.5242516
# 5 (5,6] versicolor            27 0.4736842 57 0.3398483226 0.6103478
# 6 (5,6]  virginica             8 0.1403509 57 0.0625948901 0.2579454
# 7 (6,8] versicolor            20 0.3278689 61 0.2130663358 0.4600191
# 8 (6,8]  virginica            41 0.6721311 61 0.5399808516 0.7869337

你可能是在重新发明轮子。内置函数prop.test将为您提供二项置信区间。这是一个完整的表达式:

library(tidyverse)
iris$Slen <- cut(iris$Sepal.Length, 
breaks = c(4,5,6,8))
iris %>% 
group_by(Slen, Species) %>% 
summarise(TotalParticle = n()) %>%
mutate(RelAbund = TotalParticle/sum(TotalParticle),
lower = sapply(TotalParticle, 
function(x) prop.test(x, sum(TotalParticle))$conf.int[1]),
upper = sapply(TotalParticle, 
function(x) prop.test(x, sum(TotalParticle))$conf.int[2]))
#> `summarise()` has grouped output by 'Slen'. You can override using the
#> `.groups` argument.
#> # A tibble: 8 x 6
#> # Groups:   Slen [3]
#>   Slen  Species    TotalParticle RelAbund   lower upper
#>   <fct> <fct>              <int>    <dbl>   <dbl> <dbl>
#> 1 (4,5] setosa                28   0.875  0.701   0.959
#> 2 (4,5] versicolor             3   0.0938 0.0245  0.262
#> 3 (4,5] virginica              1   0.0312 0.00163 0.180
#> 4 (5,6] setosa                22   0.386  0.263   0.524
#> 5 (5,6] versicolor            27   0.474  0.342   0.609
#> 6 (5,6] virginica              8   0.140  0.0668  0.263
#> 7 (6,8] versicolor            20   0.328  0.216   0.461
#> 8 (6,8] virginica             41   0.672  0.539   0.784

由reprex包(v2.0.1)创建于2022-06-20

最新更新