我有一个带有数字列的数据框架和一个带有标签的字符列。参见示例:
library(tidyverse)
a <- c(0.036210845, 0.005546561, 0.004394322 ,0.006635205, 2.269306824 ,0.013542101, 0.006580308 ,0.006854309,0.009076331 ,0.006577178 ,0.099406840 ,0.010962796, 0.011491922,0.007454443 ,0.004463684,0.005836916,0.011119906 ,0.009543205, 0.003990476, 0.007793532 ,0.020776231, 0.011713687, 0.010045341, 0.008411304, 0.032514994)
b <- c(0.030677829, 0.005210211, 0.004164294, 0.006279456 ,1.095908581 ,0.012029876, 0.006193405 ,0.006486812, 0.008589699, 0.006167356, 0.068956516 ,0.010140064 ,0.010602171 ,0.006898081 ,0.004193735, 0.005447855 ,0.009936211, 0.008743681, 0.003774822, 0.007375678, 0.019695336, 0.010827791, 0.009258572, 0.007960328,0.026956408)
c <- c(0.025855453, 0.004882746 ,0.003946182, 0.005929399 ,0.466284591 ,0.010704604 ,0.005815709, 0.006125196, 0.008110854, 0.005769223, 0.046847336, 0.009356712, 0.009803620 ,0.006366758, 0.003936953 ,0.005072295, 0.008885989 ,0.007989028, 0.003565631, 0.006964512, 0.018636187, 0.010009413, 0.008540876, 0.007516569,0.022227924)
label <- c("fa05","fa05" ,"fa05", "fa10", "fa10", "fa10", "fa20","fa20", "faflat", "faflat", "sa05", "sa05", "sa10" , "sa10" , "sa10" , "sa10", "sa10", "sa10", "sa20", "sa20", "sa20" ,"sa20", "saflat", "saflat", "saflat")
dataframe <- as.data.frame(cbind(a,b,c,label))
dataframe <- dataframe %>%
transform(a = as.numeric(a)) %>%
transform(b = as.numeric(b)) %>%
transform(c = as.numeric(c))
我已经编写了一个函数,它为每个标签获取一个行样本(样本中的行数=特定标签的行数(,并作为输出给出样本的平均值。示例:在源数据(数据帧(中,有3行标签"0";fa05";。让我们称它们为fa05_1,fa05_2,fa05_3(只是为了解释它(。该函数从这三行中抽取一个样本,每行由3列(a、b和c(组成。样本中fa05的数量等于源数据中的fa05数量,因此在本例中为3。该函数获取一个带有替换的样本,因此它可以fx为fa05_3、fa05_1、fa05_1。然后,它为a、b和c三列中的每一列取这三个样本的平均值,并给出输出。它看起来像这样:
samp <- function(df, col1, var){
df %>%
group_by(!!col1) %>%
nest() %>%
ungroup() %>%
mutate(n = !!var) %>%
mutate(samp = map2(data, n, sample_n, replace=T)) %>%
select(-data) %>%
unnest(samp) %>%
group_by(!!col1) %>%
dplyr::summarise(across("a":"c", mean))
}
list <- c(3,3,2,2,2,6,4,3) # The number of times each label occur in the data
samp(dataframe, quo(label), quo(list))
label a b c
<chr> <dbl> <dbl> <dbl>
1 fa05 0.00439 0.00416 0.00395
2 fa10 0.00894 0.00820 0.00752
3 fa20 0.00672 0.00634 0.00597
4 faflat 0.00908 0.00859 0.00811
5 sa05 0.0552 0.0395 0.0281
6 sa10 0.00715 0.00657 0.00603
7 sa20 0.0101 0.00956 0.00903
8 saflat 0.0250 0.0211 0.0177
我想在一些数据上使用这个函数,并有效地重复它1000次。起初它不是一个函数,我使用了rerun((,但效率很低。我读到我可以把它写成一个函数,并使用lapply,这应该更有效,但当我这样做时,它不起作用:
lapply(dataframe, samp, col1=quo(Pattern), var=quo(list))
Error in UseMethod("group_by_") :
no applicable method for 'group_by_' applied to an object of class "c('double', 'numeric')"
我该如何使用lapply?我该如何告诉lapply重新运行该函数1000次?我希望你能帮忙。
您可以直接执行
replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE)
然而,这真的很慢。
> system.time(replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE))
user system elapsed
33.83 0.03 33.87
为了更快,我们需要重写您的samp
函数。以下是tidyverse
方法
group_sample_size <- c("fa05" = 3, "fa10" = 3, "fa20" = 2, "faflat" = 2, "sa05" = 2, "sa10" = 6, "sa20" = 4, "saflat" = 3)
prep <- function(df, grp_var, sample_size) {
df %>%
mutate(size = sample_size[.data[[grp_var]]]) %>%
group_by(across(!!grp_var))
}
rep_sample <- function(df, n) {
replicate(
n,
df %>%
slice(sample.int(n(), size[[1L]], replace = TRUE)) %>%
summarise(across(a:c, mean), .groups = "drop"),
simplify = FALSE
)
}
dataframe %>%
prep("label", group_sample_size) %>%
rep_sample(1000)
性能有了显著提高,但仍然是次优IMO。完成模拟大约需要5-6秒。
> system.time(dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000))
user system elapsed
5.80 0.01 5.81
为了提高效率,我认为下面的data.table
方法会更好。
library(data.table)
fsamp <- function(df, grp_var, size, nsim) {
df <- as.data.table(df)
group_info <- table(df[[grp_var]], dnn = list(grp_var))
simu_pool <- df[, -grp_var, with = FALSE]
simu_vars <- names(simu_pool)
simu_pool <- split(simu_pool, df[[grp_var]])
out <- data.table(
simu = rep(seq_len(nsim), each = length(group_info)),
group_info
)
out[
, size := size[out[[grp_var]]]
][
, (simu_vars) := lapply(simu_pool[[.BY[[grp_var]]]][sample.int(N, size, replace = TRUE)], mean),
by = c("simu", grp_var)
][]
}
这一方法比优化的tidyverse
方法快大约四倍。
> system.time(fsamp(dataframe, "label", group_sample_size, 1000))
user system elapsed
1.47 0.04 1.50
所有三种方法都产生相同的结果集
> set.seed(124)
> # rbindlist converts a list of tibbles into a single data.table
> dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000) %>% rbindlist()
label a b c
1: fa05 0.015383909 0.013350778 0.011561460
2: fa10 0.763161377 0.371405971 0.160972865
3: fa20 0.006717308 0.006340109 0.005970452
4: faflat 0.009076331 0.008589699 0.008110854
5: sa05 0.055184818 0.039548290 0.028102024
---
7996: faflat 0.007826754 0.007378527 0.006940039
7997: sa05 0.099406840 0.068956516 0.046847336
7998: sa10 0.006648513 0.006118159 0.005626362
7999: sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569
> set.seed(124)
> fsamp(df, "label", group_sample_size, 1000)
simu label N size a b c
1: 1 fa05 3 3 0.015383909 0.013350778 0.011561460
2: 1 fa10 3 3 0.763161377 0.371405971 0.160972865
3: 1 fa20 2 2 0.006717308 0.006340109 0.005970452
4: 1 faflat 2 2 0.009076331 0.008589699 0.008110854
5: 1 sa05 2 2 0.055184818 0.039548290 0.028102024
---
7996: 1000 faflat 2 2 0.007826754 0.007378527 0.006940039
7997: 1000 sa05 2 2 0.099406840 0.068956516 0.046847336
7998: 1000 sa10 6 6 0.006648513 0.006118159 0.005626362
7999: 1000 sa20 4 4 0.020776231 0.019695336 0.018636187
8000: 1000 saflat 3 3 0.008411304 0.007960328 0.007516569
> set.seed(124)
> replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE) %>% rbindlist()
label a b c
1: fa05 0.015383909 0.013350778 0.011561460
2: fa10 0.763161377 0.371405971 0.160972865
3: fa20 0.006717308 0.006340109 0.005970452
4: faflat 0.009076331 0.008589699 0.008110854
5: sa05 0.055184818 0.039548290 0.028102024
---
7996: faflat 0.007826754 0.007378527 0.006940039
7997: sa05 0.099406840 0.068956516 0.046847336
7998: sa10 0.006648513 0.006118159 0.005626362
7999: sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569