r语言 - 为函数调用查找物种特定系数的有效方法



Andrew Robinson在irebreakeR中展示了如何使用直径和高度计算树木体积。他创建了一个函数,该函数使用取决于物种直径的系数。简化版本如下所示:

funRobinson <- function(species, diameter, height) {
bf_params <- data.frame(species  = c("Spruce", "Oak"),
b0_small = c(26.729,  29.790),
b1_small = c( 0.01189, 0.00997),
b0_large = c(32.516,  85.150),
b1_large = c( 0.01181, 0.00841))
dimensions <- data.frame(diameter   = diameter,
height     = height,
species    = as.character(species),
this_order = 1:length(species))
dimensions <- merge(y=dimensions, x=bf_params, all.y=TRUE, all.x=FALSE)
dimensions <- dimensions[order(dimensions$this_order, decreasing=FALSE),]
b0 <- with(dimensions, ifelse(diameter <= 20.5, b0_small, b0_large))
b1 <- with(dimensions, ifelse(diameter <= 20.5, b1_small, b1_large))
b0 + b1 * dimensions$diameter^2 * dimensions$height
}

对我来说,这种方法看起来很简单,但它创建了一个额外的data.frame需要排序调用ifelse两次以区分小树(diameter <= 20.5(和大树。我正在寻找一种更有效的方法(低内存消耗执行时间(来查找物种特定系数。我希望可以在不编辑函数的情况下为其他物种添加系数。

示例数据集和性能:

dat <- data.frame(species = c("Spruce", "Spruce", "Oak", "Oak", "Fir"),
diameter = c(4,   30,  4,   30,  30),
height  = c(30,  100, 30,  100, 100))
with(dat, funRobinson(species, diameter, height))
#[1]   32.4362 1095.4160   34.5756  842.0500        NA
library(microbenchmark)
microbenchmark(
Robinson = with(dat, funRobinson(species, diameter, height))
)
#Unit: milliseconds
#     expr      min       lq     mean   median       uq      max neval
# Robinson 1.832604 1.860334 1.948054 1.876155 1.905009 3.054021   100

set.seed(0)
size <- 1e5
dat2 <- data.frame(species = sample(c("Spruce", "Oak", "Fir"), size=size, replace = TRUE)
, diameter = runif(size, 1, 50)
, height  = runif(size, 1, 100))
microbenchmark(
Robinson = with(dat2, funRobinson(species, diameter, height))
)
#Unit: milliseconds
#     expr      min       lq     mean   median       uq      max neval
# Robinson 203.8171 219.9265 234.0798 227.5911 250.6204 278.9918   100

我想它避免了数据框,而是直接从向量(或矩阵(中调用值。并且 b0 和 b1 调用的值相同,因此我们只需要计算一次。

下面是一个快速尝试,很可能可以做得更快。我基本上为每个参数制作 2 个矩阵,并根据

f2 <- function(species, diameter, height) {
species_avail=c("Spruce", "Oak")
params_b0 = cbind(b0_small = c(26.729,  29.790),
b0_large = c(32.516,  85.150))
rownames(params_b0) = species_avail
params_b1 = cbind(b1_small = c( 0.01189, 0.00997),
b1_large = c( 0.01181, 0.00841))
rownames(params_b1) = species_avail
ROWS = match(species,species_avail)
COLS = +(diameter > 20.5) + 1
idx = cbind(ROWS,COLS)
b0 <- params_b0[idx]
b1 <- params_b1[idx]
b0 + b1 * diameter^2 * height
}

创建数据:

set.seed(0)
size <- 1e5
dat2 <- data.frame(species = sample(c("Spruce", "Oak", "Fir"), size=size, replace = TRUE)
, diameter = runif(size, 1, 50)
, height  = runif(size, 1, 100))

检查函数返回相同的内容:

identical(
with(dat2,funRobinson(species, diameter, height)),
with(dat2,f2(species,diameter,height))
)
[1] TRUE

测试:

microbenchmark(
Robinson = with(dat2, funRobinson(species, diameter, height)),
f2 = with(dat2, f2(species, diameter, height))
)
Unit: milliseconds
expr        min        lq      mean    median        uq       max neval
Robinson 249.677157 275.23375 303.97532 298.72475 329.04318 391.53807   100
f2   9.423471  10.16365  13.86918  10.48073  16.06827  65.19541   100
cld
b
a 

使用与@StupidWolf相同的方法,但通过存储按这些因素排序的系数,直接使用树种factor数来消除match。将系数存储在环境中可避免在每次调用函数时设置系数。

funGKiCl  <- function(params, speciesLevels) {
force(params)
force(speciesLevels)
nSpecies <- length(speciesLevels)
i <- match(speciesLevels, params$species)
params_b0  <- c(params$b0_small[i], params$b0_large[i])
params_b1  <- c(params$b1_small[i], params$b1_large[i])
rm(i, params, speciesLevels)
function(species, diameter, height) {
i <- unclass(species) + (diameter > 20.5) * nSpecies
params_b0[i] + params_b1[i] * diameter * diameter * height
}
}
dat <- data.frame(species = c("Spruce", "Spruce", "Oak", "Oak", "Fir")
, diameter = c(4,   30,  4,   30,  30)
, height  = c(30,  100, 30,  100, 100))
params <- read.table(header = TRUE, text = "
species b0_small b1_small b0_large b1_large
Spruce    26.729  0.01189   32.516  0.01181
Oak       29.790  0.00997   85.150  0.00841")
funGKi <- compiler::cmpfun(funGKiCl(params, levels(dat$species)))
with(dat, funGKi(species, diameter, height))
#[1]   32.4362 1095.4160   34.5756  842.0500        NA

目前 @GKi 中的方法速度最快,使用内存最少

数据:

dat <- data.frame(species = c("Spruce", "Spruce", "Oak", "Oak", "Fir")
, diameter = c(4,   30,  4,   30,  30)
, height  = c(30,  100, 30,  100, 100))
set.seed(0)
size <- 1e5
dat2 <- data.frame(
species = sample(c("Spruce", "Oak", "Fir"), size=size, replace = TRUE)
, diameter = runif(size, 1, 50)
, height  = runif(size, 1, 100))

方法:

funRobinson <- function(species, diameter, height) {
bf_params <- data.frame(species  = c("Spruce", "Oak"),
b0_small = c(26.729,  29.790),
b1_small = c( 0.01189, 0.00997),
b0_large = c(32.516,  85.150),
b1_large = c( 0.01181, 0.00841))
dimensions <- data.frame(diameter   = diameter,
height     = height,
species    = as.character(species),
this_order = 1:length(species))
dimensions <- merge(y=dimensions, x=bf_params, all.y=TRUE, all.x=FALSE)
dimensions <- dimensions[order(dimensions$this_order, decreasing=FALSE),]
b0 <- with(dimensions, ifelse(diameter <= 20.5, b0_small, b0_large))
b1 <- with(dimensions, ifelse(diameter <= 20.5, b1_small, b1_large))
b0 + b1 * dimensions$diameter^2 * dimensions$height
}
with(dat, funRobinson(species, diameter, height))
funStupidWolf <- function(species, diameter, height) {
species_avail=c("Spruce", "Oak")
params_b0 = cbind(b0_small = c(26.729,  29.790),
b0_large = c(32.516,  85.150))
rownames(params_b0) = species_avail
params_b1 = cbind(b1_small = c( 0.01189, 0.00997),
b1_large = c( 0.01181, 0.00841))
rownames(params_b1) = species_avail
ROWS = match(species,species_avail)
COLS = +(diameter > 20.5) + 1
idx = cbind(ROWS,COLS)
b0 <- params_b0[idx]
b1 <- params_b1[idx]
b0 + b1 * diameter^2 * height
}
with(dat, funStupidWolf(species, diameter, height))
funGKiCl  <- function(params, speciesLevels) {
force(params)
force(speciesLevels)
nSpecies <- length(speciesLevels)
i <- match(speciesLevels, params$species)
params_b0  <- c(params$b0_small[i], params$b0_large[i])
params_b1  <- c(params$b1_small[i], params$b1_large[i])
rm(i, params, speciesLevels)
function(species, diameter, height) {
i <- unclass(species) + (diameter > 20.5) * nSpecies
params_b0[i] + params_b1[i] * diameter * diameter * height
}
}
params <- read.table(header = TRUE, text = "
species b0_small b1_small b0_large b1_large
Spruce    26.729  0.01189   32.516  0.01181
Oak       29.790  0.00997   85.150  0.00841")
funGKi <- compiler::cmpfun(funGKiCl(params, levels(dat$species)))
with(dat, funGKi(species, diameter, height))
rm(funGKiCl, params)
fun <- alist(Robinson = funRobinson(species, diameter, height)
, StupidWolf = funStupidWolf(species, diameter, height)
, GKi = funGKi(species, diameter, height))

时间:

library(microbenchmark)
attach(dat)
microbenchmark(list = fun, check = "equal")
#Unit: microseconds
#       expr      min       lq       mean    median        uq      max neval
#   Robinson 1876.491 1911.583 1997.00924 1934.8835 1962.3145 3131.453   100
# StupidWolf   15.618   17.371   22.30764   18.9995   26.5125   33.239   100
#        GKi    2.270    2.965    4.04041    3.6825    5.0415    7.434   100
microbenchmark(list = fun, check = "equal", control=list(order="block"))
#Unit: microseconds
#       expr      min        lq       mean   median        uq      max neval
#   Robinson 1887.906 1918.0475 2000.55586 1938.847 1962.9540 3131.112   100
# StupidWolf   15.184   16.2775   16.97111   16.668   17.2230   34.646   100
#        GKi    2.063    2.1560    2.37552    2.255    2.4015    5.616   100
attach(dat2)
microbenchmark(list = fun, setup = gc(), check = "equal")
#Unit: milliseconds
#       expr        min         lq       mean     median         uq        max neval
#   Robinson 189.342408 193.222831 193.682868 193.573419 194.181910 198.231698   100
# StupidWolf   6.755601   6.786439   6.836253   6.804451   6.832409   7.370937   100
#        GKi   1.756241   1.767335   1.794328   1.782949   1.806370   1.964409   100

library(bench)
attach(dat)
mark(exprs = fun, iterations = 100)
#  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
#  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
#1 Robinson    1.87ms   1.9ms      521.        0B     10.6    98     2    188.1ms
#2 StupidWolf 16.48µs 17.46µs    55666.        0B      0     100     0      1.8ms
#3 GKi         2.67µs  2.86µs   337265.        0B      0     100     0    296.5µs
attach(dat2)
mark(exprs = fun, iterations = 100)
#  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
#1 Robinson   188.96ms 216.15ms      4.44   67.71MB     11.4   100   257
#2 StupidWolf   6.79ms   6.85ms    131.      8.77MB     30.0   100    23
#3 GKi           1.7ms   1.72ms    552.      2.29MB     22.1   100     4
#Some expressions had a GC in every iteration; so filtering is disabled. 

记忆:

memUse <- function(list, setup = "", gctort = FALSE) {
as.data.frame(lapply(list, function(z) {
eval(setup)
tt <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14])
gctorture(on = gctort)
eval(z)
gctorture(on = FALSE)
sum(.Internal(gc(FALSE, FALSE, TRUE))[13:14]) - tt
}))
}
attach(dat)
memUse(list=fun, gctort = FALSE)
# Robinson StupidWolf GKi
#1     0.9          0   0
memUse(list=fun, gctort = TRUE)
# Robinson StupidWolf GKi
#1       0          0   0
attach(dat2)
memUse(list=fun, gctort = FALSE)
# Robinson StupidWolf GKi
#1    71.9        8.8 2.3
memUse(list=fun, gctort = TRUE)
# Robinson StupidWolf GKi
#1    29.7        6.5 2.3
object.size(funRobinson)
#109784 bytes
object.size(funStupidWolf)
#68240 bytes
object.size(funGKi)
#21976 bytes

最新更新