r-使用lapply使引导功能更加高效



我有一个带有数字列的数据框架和一个带有标签的字符列。参见示例:

library(tidyverse)
a <- c(0.036210845, 0.005546561, 0.004394322 ,0.006635205, 2.269306824 ,0.013542101, 0.006580308 ,0.006854309,0.009076331 ,0.006577178 ,0.099406840 ,0.010962796, 0.011491922,0.007454443 ,0.004463684,0.005836916,0.011119906 ,0.009543205, 0.003990476, 0.007793532 ,0.020776231, 0.011713687, 0.010045341, 0.008411304, 0.032514994)
b <- c(0.030677829, 0.005210211, 0.004164294, 0.006279456 ,1.095908581 ,0.012029876, 0.006193405 ,0.006486812, 0.008589699, 0.006167356, 0.068956516 ,0.010140064 ,0.010602171 ,0.006898081 ,0.004193735, 0.005447855 ,0.009936211, 0.008743681, 0.003774822, 0.007375678, 0.019695336, 0.010827791, 0.009258572, 0.007960328,0.026956408)
c <- c(0.025855453, 0.004882746 ,0.003946182, 0.005929399 ,0.466284591 ,0.010704604 ,0.005815709, 0.006125196, 0.008110854, 0.005769223, 0.046847336, 0.009356712, 0.009803620 ,0.006366758, 0.003936953 ,0.005072295, 0.008885989 ,0.007989028, 0.003565631, 0.006964512, 0.018636187, 0.010009413, 0.008540876, 0.007516569,0.022227924)
label <- c("fa05","fa05" ,"fa05", "fa10", "fa10",  "fa10", "fa20","fa20", "faflat", "faflat", "sa05", "sa05", "sa10" ,  "sa10" , "sa10" , "sa10", "sa10", "sa10", "sa20", "sa20", "sa20" ,"sa20", "saflat", "saflat", "saflat")
dataframe <- as.data.frame(cbind(a,b,c,label))
dataframe <- dataframe %>%
  transform(a = as.numeric(a)) %>%
  transform(b = as.numeric(b)) %>%
  transform(c = as.numeric(c))

我已经编写了一个函数,它为每个标签获取一个行样本(样本中的行数=特定标签的行数(,并作为输出给出样本的平均值。示例:在源数据(数据帧(中,有3行标签"0";fa05";。让我们称它们为fa05_1,fa05_2,fa05_3(只是为了解释它(。该函数从这三行中抽取一个样本,每行由3列(a、b和c(组成。样本中fa05的数量等于源数据中的fa05数量,因此在本例中为3。该函数获取一个带有替换的样本,因此它可以fx为fa05_3、fa05_1、fa05_1。然后,它为a、b和c三列中的每一列取这三个样本的平均值,并给出输出。它看起来像这样:

samp <- function(df, col1, var){
  df %>% 
    group_by(!!col1) %>% 
    nest() %>%
    ungroup() %>% 
    mutate(n = !!var) %>%
    mutate(samp = map2(data, n, sample_n, replace=T)) %>%
    select(-data) %>%
    unnest(samp) %>%
    group_by(!!col1) %>%
    dplyr::summarise(across("a":"c", mean))
}
list <- c(3,3,2,2,2,6,4,3) # The number of times each label occur in the data 
samp(dataframe, quo(label), quo(list))
  label        a       b       c
  <chr>    <dbl>   <dbl>   <dbl>
1 fa05   0.00439 0.00416 0.00395
2 fa10   0.00894 0.00820 0.00752
3 fa20   0.00672 0.00634 0.00597
4 faflat 0.00908 0.00859 0.00811
5 sa05   0.0552  0.0395  0.0281 
6 sa10   0.00715 0.00657 0.00603
7 sa20   0.0101  0.00956 0.00903
8 saflat 0.0250  0.0211  0.0177 

我想在一些数据上使用这个函数,并有效地重复它1000次。起初它不是一个函数,我使用了rerun((,但效率很低。我读到我可以把它写成一个函数,并使用lapply,这应该更有效,但当我这样做时,它不起作用:

lapply(dataframe, samp, col1=quo(Pattern), var=quo(list))
Error in UseMethod("group_by_") : 
  no applicable method for 'group_by_' applied to an object of class "c('double', 'numeric')"

我该如何使用lapply?我该如何告诉lapply重新运行该函数1000次?我希望你能帮忙。

您可以直接执行

replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE)

然而,这真的很慢。

> system.time(replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE))
   user  system elapsed 
  33.83    0.03   33.87

为了更快,我们需要重写您的samp函数。以下是tidyverse方法

group_sample_size <- c("fa05" = 3, "fa10" = 3, "fa20" = 2, "faflat" = 2, "sa05" = 2, "sa10" = 6, "sa20" = 4, "saflat" = 3)
prep <- function(df, grp_var, sample_size) {
  df %>% 
    mutate(size = sample_size[.data[[grp_var]]]) %>% 
    group_by(across(!!grp_var))
}
rep_sample <- function(df, n) {
  replicate(
    n,
    df %>% 
      slice(sample.int(n(), size[[1L]], replace = TRUE)) %>% 
      summarise(across(a:c, mean), .groups = "drop"), 
    simplify = FALSE
  )
}
dataframe %>% 
  prep("label", group_sample_size) %>% 
  rep_sample(1000)

性能有了显著提高,但仍然是次优IMO。完成模拟大约需要5-6秒。

> system.time(dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000))
   user  system elapsed 
   5.80    0.01    5.81 

为了提高效率,我认为下面的data.table方法会更好。

library(data.table)
fsamp <- function(df, grp_var, size, nsim) {
  df <- as.data.table(df)
  group_info <- table(df[[grp_var]], dnn = list(grp_var))
  simu_pool <- df[, -grp_var, with = FALSE]
  simu_vars <- names(simu_pool)
  simu_pool <- split(simu_pool, df[[grp_var]])
  
  out <- data.table(
    simu = rep(seq_len(nsim), each = length(group_info)), 
    group_info
  )
  
  out[
    , size := size[out[[grp_var]]]
  ][
    , (simu_vars) := lapply(simu_pool[[.BY[[grp_var]]]][sample.int(N, size, replace = TRUE)], mean),
    by = c("simu", grp_var)
  ][]
}

这一方法比优化的tidyverse方法快大约四倍。

> system.time(fsamp(dataframe, "label", group_sample_size, 1000))
   user  system elapsed 
   1.47    0.04    1.50

所有三种方法都产生相同的结果集

> set.seed(124)
> # rbindlist converts a list of tibbles into a single data.table
> dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000) %>% rbindlist()
       label           a           b           c
   1:   fa05 0.015383909 0.013350778 0.011561460
   2:   fa10 0.763161377 0.371405971 0.160972865
   3:   fa20 0.006717308 0.006340109 0.005970452
   4: faflat 0.009076331 0.008589699 0.008110854
   5:   sa05 0.055184818 0.039548290 0.028102024
  ---                                           
7996: faflat 0.007826754 0.007378527 0.006940039
7997:   sa05 0.099406840 0.068956516 0.046847336
7998:   sa10 0.006648513 0.006118159 0.005626362
7999:   sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569
> set.seed(124)
> fsamp(df, "label", group_sample_size, 1000)
      simu  label N size           a           b           c
   1:    1   fa05 3    3 0.015383909 0.013350778 0.011561460
   2:    1   fa10 3    3 0.763161377 0.371405971 0.160972865
   3:    1   fa20 2    2 0.006717308 0.006340109 0.005970452
   4:    1 faflat 2    2 0.009076331 0.008589699 0.008110854
   5:    1   sa05 2    2 0.055184818 0.039548290 0.028102024
  ---                                                       
7996: 1000 faflat 2    2 0.007826754 0.007378527 0.006940039
7997: 1000   sa05 2    2 0.099406840 0.068956516 0.046847336
7998: 1000   sa10 6    6 0.006648513 0.006118159 0.005626362
7999: 1000   sa20 4    4 0.020776231 0.019695336 0.018636187
8000: 1000 saflat 3    3 0.008411304 0.007960328 0.007516569
> set.seed(124)
> replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE) %>% rbindlist()
       label           a           b           c
   1:   fa05 0.015383909 0.013350778 0.011561460
   2:   fa10 0.763161377 0.371405971 0.160972865
   3:   fa20 0.006717308 0.006340109 0.005970452
   4: faflat 0.009076331 0.008589699 0.008110854
   5:   sa05 0.055184818 0.039548290 0.028102024
  ---                                           
7996: faflat 0.007826754 0.007378527 0.006940039
7997:   sa05 0.099406840 0.068956516 0.046847336
7998:   sa10 0.006648513 0.006118159 0.005626362
7999:   sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569

最新更新