R - 提高采样功能速度



>假设我有以下数据结构:

sl_sev_disbn <- data.frame("lower_band" = c(0,10e6,20e6,30e6,0,0,0),
"upper_band" = c(10e6,20e6,30e6,40e6,0,0,0),
"prob" = c(0.56521739,0.34782609,0.08212560,0.00483092,0,0,0),
"band" = c(1,2,3,4,5,6,7))

我想使用给定的概率作为权重从该数据帧中随机采样"波段"。从这个波段,我想从给定的下部和上部波段均匀地采样一个随机数,并存储每个样本并将其返回到向量中。我有以下代码:

rsuper_lrg_disbn <- function(n = 1,df){
band_sample <- sample(x = df$band, size = n, prob = df$prob,replace=TRUE)
vals <- c()
for (band in band_sample){
filt_df <- df[df$band == band,] #filter to randomly selected band
loss <- runif(1,min=filt_df$lower_band,max=filt_df$upper_band)
vals <- c(vals,loss)
}
return(vals)
}

然后使用它就像:rsuper_lrg_disbn(n=2,sl_sev_disbn)

但是,如果我使用 n 非常大(例如 n = 1e6(,这段代码会大大减慢。

有谁知道我怎样才能加快速度?

利用runif被矢量化的事实!

sl_sev_disbn <- data.frame("lower_band" = c(0,10e6,20e6,30e6,0,0,0),
"upper_band" = c(10e6,20e6,30e6,40e6,0,0,0),
"prob" = c(0.56521739,0.34782609,0.08212560,0.00483092,0,0,0),
"band" = c(1,2,3,4,5,6,7))
rsuper_lrg_disbn <- function(n = 1,df){
band_sample <- sample(x = df$band, size = n, prob = df$prob,replace=TRUE)
vals <- c()
for (band in band_sample){
filt_df <- df[df$band == band,] #filter to randomly selected band
loss <- runif(1,min=filt_df$lower_band,max=filt_df$upper_band)
vals <- c(vals,loss)
}
return(vals)
}
fast_samp <- function(n = 1, df) {
band_sample <- sample(x = df$band, size = n, prob = df$prob,replace=TRUE)
vals <- runif(n, min = df[band_sample, 'lower_band'], max = df[band_sample, 'upper_band'])
return(vals)
}
## same dist
summary(rsuper_lrg_disbn(n = 3e4, sl_sev_disbn))
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>      832  4428211  8903344 10290102 15373486 39992683
summary(fast_samp(n = 3e4, sl_sev_disbn))
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>     2278  4435472  8827781 10261747 15312544 39703908
library(microbenchmark)
microbenchmark(rsuper_lrg_disbn(n = 1e3, sl_sev_disbn),
fast_samp(n = 1e3, sl_sev_disbn))
#> Unit: microseconds
#>                                      expr       min        lq       mean
#>  rsuper_lrg_disbn(n = 1000, sl_sev_disbn) 36032.381 37381.912 38232.6291
#>         fast_samp(n = 1000, sl_sev_disbn)    75.062    79.012   115.8676
#>     median        uq       max neval
#>  37672.677 38327.886 60730.445   100
#>     89.284    92.444  2689.974   100

创建于 2019-10-16 由 reprex 软件包 (v0.3.0(

最新更新