Andrew Robinson在irebreakeR中展示了如何使用直径和高度计算树木体积。他创建了一个函数,该函数使用取决于物种和直径的系数。简化版本如下所示:
funRobinson <- function(species, diameter, height) {
bf_params <- data.frame(species = c("Spruce", "Oak"),
b0_small = c(26.729, 29.790),
b1_small = c( 0.01189, 0.00997),
b0_large = c(32.516, 85.150),
b1_large = c( 0.01181, 0.00841))
dimensions <- data.frame(diameter = diameter,
height = height,
species = as.character(species),
this_order = 1:length(species))
dimensions <- merge(y=dimensions, x=bf_params, all.y=TRUE, all.x=FALSE)
dimensions <- dimensions[order(dimensions$this_order, decreasing=FALSE),]
b0 <- with(dimensions, ifelse(diameter <= 20.5, b0_small, b0_large))
b1 <- with(dimensions, ifelse(diameter <= 20.5, b1_small, b1_large))
b0 + b1 * dimensions$diameter^2 * dimensions$height
}
对我来说,这种方法看起来很简单,但它创建了一个额外的data.frame
,需要排序并调用ifelse
两次以区分小树(diameter <= 20.5
(和大树。我正在寻找一种更有效的方法(低内存消耗、执行时间(来查找物种特定系数。我希望可以在不编辑函数的情况下为其他物种添加系数。
示例数据集和性能:
dat <- data.frame(species = c("Spruce", "Spruce", "Oak", "Oak", "Fir"),
diameter = c(4, 30, 4, 30, 30),
height = c(30, 100, 30, 100, 100))
with(dat, funRobinson(species, diameter, height))
#[1] 32.4362 1095.4160 34.5756 842.0500 NA
library(microbenchmark)
microbenchmark(
Robinson = with(dat, funRobinson(species, diameter, height))
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# Robinson 1.832604 1.860334 1.948054 1.876155 1.905009 3.054021 100
set.seed(0)
size <- 1e5
dat2 <- data.frame(species = sample(c("Spruce", "Oak", "Fir"), size=size, replace = TRUE)
, diameter = runif(size, 1, 50)
, height = runif(size, 1, 100))
microbenchmark(
Robinson = with(dat2, funRobinson(species, diameter, height))
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# Robinson 203.8171 219.9265 234.0798 227.5911 250.6204 278.9918 100
我想它避免了数据框,而是直接从向量(或矩阵(中调用值。并且 b0 和 b1 调用的值相同,因此我们只需要计算一次。
下面是一个快速尝试,很可能可以做得更快。我基本上为每个参数制作 2 个矩阵,并根据
f2 <- function(species, diameter, height) {
species_avail=c("Spruce", "Oak")
params_b0 = cbind(b0_small = c(26.729, 29.790),
b0_large = c(32.516, 85.150))
rownames(params_b0) = species_avail
params_b1 = cbind(b1_small = c( 0.01189, 0.00997),
b1_large = c( 0.01181, 0.00841))
rownames(params_b1) = species_avail
ROWS = match(species,species_avail)
COLS = +(diameter > 20.5) + 1
idx = cbind(ROWS,COLS)
b0 <- params_b0[idx]
b1 <- params_b1[idx]
b0 + b1 * diameter^2 * height
}
创建数据:
set.seed(0)
size <- 1e5
dat2 <- data.frame(species = sample(c("Spruce", "Oak", "Fir"), size=size, replace = TRUE)
, diameter = runif(size, 1, 50)
, height = runif(size, 1, 100))
检查函数返回相同的内容:
identical(
with(dat2,funRobinson(species, diameter, height)),
with(dat2,f2(species,diameter,height))
)
[1] TRUE
测试:
microbenchmark(
Robinson = with(dat2, funRobinson(species, diameter, height)),
f2 = with(dat2, f2(species, diameter, height))
)
Unit: milliseconds
expr min lq mean median uq max neval
Robinson 249.677157 275.23375 303.97532 298.72475 329.04318 391.53807 100
f2 9.423471 10.16365 13.86918 10.48073 16.06827 65.19541 100
cld
b
a
使用与@StupidWolf相同的方法,但通过存储按这些因素排序的系数,直接使用树种factor
数来消除match
。将系数存储在环境中可避免在每次调用函数时设置系数。
funGKiCl <- function(params, speciesLevels) {
force(params)
force(speciesLevels)
nSpecies <- length(speciesLevels)
i <- match(speciesLevels, params$species)
params_b0 <- c(params$b0_small[i], params$b0_large[i])
params_b1 <- c(params$b1_small[i], params$b1_large[i])
rm(i, params, speciesLevels)
function(species, diameter, height) {
i <- unclass(species) + (diameter > 20.5) * nSpecies
params_b0[i] + params_b1[i] * diameter * diameter * height
}
}
dat <- data.frame(species = c("Spruce", "Spruce", "Oak", "Oak", "Fir")
, diameter = c(4, 30, 4, 30, 30)
, height = c(30, 100, 30, 100, 100))
params <- read.table(header = TRUE, text = "
species b0_small b1_small b0_large b1_large
Spruce 26.729 0.01189 32.516 0.01181
Oak 29.790 0.00997 85.150 0.00841")
funGKi <- compiler::cmpfun(funGKiCl(params, levels(dat$species)))
with(dat, funGKi(species, diameter, height))
#[1] 32.4362 1095.4160 34.5756 842.0500 NA
目前 @GKi 中的方法速度最快,使用内存最少。
数据:
dat <- data.frame(species = c("Spruce", "Spruce", "Oak", "Oak", "Fir")
, diameter = c(4, 30, 4, 30, 30)
, height = c(30, 100, 30, 100, 100))
set.seed(0)
size <- 1e5
dat2 <- data.frame(
species = sample(c("Spruce", "Oak", "Fir"), size=size, replace = TRUE)
, diameter = runif(size, 1, 50)
, height = runif(size, 1, 100))
方法:
funRobinson <- function(species, diameter, height) {
bf_params <- data.frame(species = c("Spruce", "Oak"),
b0_small = c(26.729, 29.790),
b1_small = c( 0.01189, 0.00997),
b0_large = c(32.516, 85.150),
b1_large = c( 0.01181, 0.00841))
dimensions <- data.frame(diameter = diameter,
height = height,
species = as.character(species),
this_order = 1:length(species))
dimensions <- merge(y=dimensions, x=bf_params, all.y=TRUE, all.x=FALSE)
dimensions <- dimensions[order(dimensions$this_order, decreasing=FALSE),]
b0 <- with(dimensions, ifelse(diameter <= 20.5, b0_small, b0_large))
b1 <- with(dimensions, ifelse(diameter <= 20.5, b1_small, b1_large))
b0 + b1 * dimensions$diameter^2 * dimensions$height
}
with(dat, funRobinson(species, diameter, height))
funStupidWolf <- function(species, diameter, height) {
species_avail=c("Spruce", "Oak")
params_b0 = cbind(b0_small = c(26.729, 29.790),
b0_large = c(32.516, 85.150))
rownames(params_b0) = species_avail
params_b1 = cbind(b1_small = c( 0.01189, 0.00997),
b1_large = c( 0.01181, 0.00841))
rownames(params_b1) = species_avail
ROWS = match(species,species_avail)
COLS = +(diameter > 20.5) + 1
idx = cbind(ROWS,COLS)
b0 <- params_b0[idx]
b1 <- params_b1[idx]
b0 + b1 * diameter^2 * height
}
with(dat, funStupidWolf(species, diameter, height))
funGKiCl <- function(params, speciesLevels) {
force(params)
force(speciesLevels)
nSpecies <- length(speciesLevels)
i <- match(speciesLevels, params$species)
params_b0 <- c(params$b0_small[i], params$b0_large[i])
params_b1 <- c(params$b1_small[i], params$b1_large[i])
rm(i, params, speciesLevels)
function(species, diameter, height) {
i <- unclass(species) + (diameter > 20.5) * nSpecies
params_b0[i] + params_b1[i] * diameter * diameter * height
}
}
params <- read.table(header = TRUE, text = "
species b0_small b1_small b0_large b1_large
Spruce 26.729 0.01189 32.516 0.01181
Oak 29.790 0.00997 85.150 0.00841")
funGKi <- compiler::cmpfun(funGKiCl(params, levels(dat$species)))
with(dat, funGKi(species, diameter, height))
rm(funGKiCl, params)
fun <- alist(Robinson = funRobinson(species, diameter, height)
, StupidWolf = funStupidWolf(species, diameter, height)
, GKi = funGKi(species, diameter, height))
时间:
library(microbenchmark)
attach(dat)
microbenchmark(list = fun, check = "equal")
#Unit: microseconds
# expr min lq mean median uq max neval
# Robinson 1876.491 1911.583 1997.00924 1934.8835 1962.3145 3131.453 100
# StupidWolf 15.618 17.371 22.30764 18.9995 26.5125 33.239 100
# GKi 2.270 2.965 4.04041 3.6825 5.0415 7.434 100
microbenchmark(list = fun, check = "equal", control=list(order="block"))
#Unit: microseconds
# expr min lq mean median uq max neval
# Robinson 1887.906 1918.0475 2000.55586 1938.847 1962.9540 3131.112 100
# StupidWolf 15.184 16.2775 16.97111 16.668 17.2230 34.646 100
# GKi 2.063 2.1560 2.37552 2.255 2.4015 5.616 100
attach(dat2)
microbenchmark(list = fun, setup = gc(), check = "equal")
#Unit: milliseconds
# expr min lq mean median uq max neval
# Robinson 189.342408 193.222831 193.682868 193.573419 194.181910 198.231698 100
# StupidWolf 6.755601 6.786439 6.836253 6.804451 6.832409 7.370937 100
# GKi 1.756241 1.767335 1.794328 1.782949 1.806370 1.964409 100
library(bench)
attach(dat)
mark(exprs = fun, iterations = 100)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#1 Robinson 1.87ms 1.9ms 521. 0B 10.6 98 2 188.1ms
#2 StupidWolf 16.48µs 17.46µs 55666. 0B 0 100 0 1.8ms
#3 GKi 2.67µs 2.86µs 337265. 0B 0 100 0 296.5µs
attach(dat2)
mark(exprs = fun, iterations = 100)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 Robinson 188.96ms 216.15ms 4.44 67.71MB 11.4 100 257
#2 StupidWolf 6.79ms 6.85ms 131. 8.77MB 30.0 100 23
#3 GKi 1.7ms 1.72ms 552. 2.29MB 22.1 100 4
#Some expressions had a GC in every iteration; so filtering is disabled.
记忆:
memUse <- function(list, setup = "", gctort = FALSE) {
as.data.frame(lapply(list, function(z) {
eval(setup)
tt <- sum(.Internal(gc(FALSE, TRUE, TRUE))[13:14])
gctorture(on = gctort)
eval(z)
gctorture(on = FALSE)
sum(.Internal(gc(FALSE, FALSE, TRUE))[13:14]) - tt
}))
}
attach(dat)
memUse(list=fun, gctort = FALSE)
# Robinson StupidWolf GKi
#1 0.9 0 0
memUse(list=fun, gctort = TRUE)
# Robinson StupidWolf GKi
#1 0 0 0
attach(dat2)
memUse(list=fun, gctort = FALSE)
# Robinson StupidWolf GKi
#1 71.9 8.8 2.3
memUse(list=fun, gctort = TRUE)
# Robinson StupidWolf GKi
#1 29.7 6.5 2.3
object.size(funRobinson)
#109784 bytes
object.size(funStupidWolf)
#68240 bytes
object.size(funGKi)
#21976 bytes