我有一个矩阵(mat_cdf),表示人口普查区 i 中的个人在给定日期移动到人口普查区 j 的累积概率。对于决定不“待在家里”的个体,我用下面的 GetCTMove 函数从这个矩阵中随机抽样,以确定他们将在哪个人口普查区度过时间。
# Random generation
# Number of census tracts; also the largest destination index considered.
cts <- 500
# One Poisson rate per census tract.
i <- rgamma(cts, 50, 1)
prop <- seq_len(cts)
# Matrix where rows correspond to probability mass of column integer
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))
# Convert to cumulative probability mass
mat_cdf <- matrix(NA_real_, cts, cts)
# The loop index is deliberately not called `i`, so the rate vector `i`
# is still intact once the loop has finished.
for (ct in seq_len(cts)) {
  # Create cdf for row ct; cumsum() builds the running total in one pass
  # instead of re-summing the row prefix for every column.
  mat_cdf[ct, ] <- cumsum(mat[ct, ])
}
# Draw a destination census tract for every agent.
#
# agent_cts:  home census-tract index of each agent; every value selects a
#             row of ct_mat_cdf.
# ct_mat_cdf: matrix whose row i holds the cumulative probabilities of moving
#             from tract i to the tract of each column.
#
# Returns one column index per agent.
GetCTMove <- function(agent_cts, ct_mat_cdf) {
  # Expand such that every agent has its own row corresponding to the CDF of
  # movement from their home ct i to j. drop = FALSE keeps the result a matrix
  # when there is only one agent; without it the single row collapses to a
  # plain vector and max.col() treats it as a one-column matrix.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]
  # One uniform draw per agent. In the comparison below s is recycled down the
  # columns, so every entry of row k is compared with s[k].
  s <- runif(length(agent_cts))
  # Column of the first TRUE in each row, i.e. the first cumulative
  # probability that exceeds the draw.
  # NOTE(review): a row with no TRUE at all (draw not below any entry, possible
  # when the row does not reach 1) should come back as column 1 -- confirm
  # that this fallback is acceptable.
  max.col(s < mat_expand, "first")
}
# Sample of 500,000 agents' residence ct (drawn with replacement).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
agents <- sample(seq_len(cts), size = 500000, replace = TRUE)
# Run function once and report how long the draw takes
system.time(GetCTMove(agents, mat_cdf))
user system elapsed
3.09 1.19 4.30
使用 100 万个代理时,每次抽样大约需要 10 秒;乘以许多时间步长后,每次模拟需要数小时,而这个函数正是模型的性能瓶颈。我想知道是否有人对更快地实现这种随机抽样有建议。我已经使用 dqrng 包来加快随机数的生成,但与耗时最长的矩阵展开(mat_expand)和 max.col 调用相比,这部分节省的时间相对较小。
您可以优化的第一件事是以下代码:
max.col(s < mat_expand, "first")
由于 s < mat_expand 返回一个逻辑矩阵,对它应用 max.col 函数等同于取每行中第一个 TRUE 的位置。在这种情况下,使用 which 会更高效。此外,如下所示,您将所有 CDF 存储在一个矩阵中。
# One row of Poisson probability mass per census tract, stacked into a matrix
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))
mat_cdf <- matrix(NA_real_, cts, cts)
# The loop index is not named `i`, so the rate vector `i` is still available
# to code that runs after this loop.
for (ct in seq_len(cts)) {
  # Running total of row ct gives that tract's CDF
  mat_cdf[ct, ] <- cumsum(mat[ct, ])
}
这种结构可能不是最佳的。list 结构更适合应用类似 which 的函数。它的构建速度也更快,因为您不必经过 do.call(rbind, ...)。
# Keep the CDFs in a list (one cumulative-probability vector per rate in `i`)
# rather than a matrix, which speeds up their creation
ls_cdf <- lapply(i, function(rate) {
  pmf <- dpois(prop, rate)
  cumsum(pmf)
})
以下是您的实现:
# Implementation 1
# Draws a destination census tract for every agent from a matrix of CDFs.
#   agent_cts:  home-tract index of each agent (selects a row of ct_mat_cdf)
#   ct_mat_cdf: matrix with one row of cumulative probabilities per tract
# Returns one column index per agent.
GetCTMove <- function(agent_cts, ct_mat_cdf) {
  # drop = FALSE keeps a matrix even for a single agent; otherwise the lone
  # row collapses to a vector and max.col() sees a one-column matrix.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]
  # One uniform draw per agent, recycled so that row k is compared with s[k]
  s <- runif(length(agent_cts))
  # Position of the first TRUE per row = first cumulative probability above
  # the draw.
  # NOTE(review): an all-FALSE row should be reported as column 1 -- confirm
  # that is acceptable for rows that do not reach 1.
  max.col(s < mat_expand, "first")
}
在我的桌面上,运行大约需要2.68秒。
> system.time(GetCTMove(agents, mat_cdf))
user system elapsed
2.25 0.41 2.68
使用 list 结构和 which 函数,运行时间可以减少约 1 秒。
# Implementation 2
# Draws a destination tract for each agent from a list of CDF vectors.
#   agent_cts: home-tract index of each agent (selects an element of ls_cdf)
#   ls_cdf:    list of cumulative-probability vectors
# Returns an integer vector with one destination index per agent.
GetCTMove2 <- function(agent_cts, ls_cdf) {
  n_agents <- length(agent_cts)
  draws <- runif(n_agents)
  dest <- integer(n_agents)
  for (k in seq_len(n_agents)) {
    home_cdf <- ls_cdf[[agent_cts[[k]]]]
    # First position whose cumulative probability is above this draw;
    # the [[1L]] extraction fails if no position qualifies.
    dest[[k]] <- which(draws[[k]] < home_cdf)[[1L]]
  }
  dest
}
> system.time(GetCTMove2(agents, ls_cdf))
user system elapsed
1.59 0.02 1.64
据我所知,仅靠 R 本身已没有其他办法进一步加快这段代码。不过,您可以用 C++ 重写关键函数 GetCTMove 来提高性能。借助 Rcpp 包,您可以这样做:
# Implementation 3
# Compile a C++ helper with Rcpp. fast_GetCTMove(agents, s, cdfs) returns, for
# every agent k, the 1-based position of the first element of
# cdfs[[agents[k]]] that is greater than s[k]; the entry stays 0 when no
# element qualifies. Invalid input (fewer draws than agents, or an agent value
# that is not a valid position in cdfs) raises an R error rather than reading
# outside the vectors.
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
  int n = agents.size();
  if (s.size() < n) {
    stop("s must supply one draw per agent");
  }
  int n_cdfs = cdfs.size();
  // Zero-filled on construction: 0 marks agents for which no entry exceeded the draw.
  NumericVector out(n);
  for (int i = 0; i < n; ++i) {
    // Negated range test so that NA/NaN values are rejected as well.
    if (!(agents[i] >= 1 && agents[i] <= n_cdfs)) {
      stop("agents must hold valid 1-based positions in cdfs");
    }
    NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]);
    int m = cdf.size();
    for (int j = 0; j < m; ++j) {
      if (s[i] < cdf[j]) {
        // Convert the 0-based C++ position to a 1-based R index.
        out[i] = j + 1;
        break;
      }
    }
  }
  return out;
}')
# Wrapper around the compiled sampler: draws one uniform number per agent and
# hands the agents, the draws and the list of CDF vectors to fast_GetCTMove().
#   agent_cts: home-tract index of each agent
#   ls_cdf:    list of cumulative-probability vectors
# Returns whatever fast_GetCTMove() returns for these inputs.
GetCTMove3 <- function(agent_cts, ls_cdf) {
  n_agents <- length(agent_cts)
  draws <- runif(n_agents)
  fast_GetCTMove(agent_cts, draws, ls_cdf)
}
这个实现非常快,可以满足您的所有需求。
> system.time(GetCTMove3(agents, ls_cdf))
user system elapsed
0.07 0.00 0.06
完整的脚本如下所示:
# Random generation
# Number of census tracts; also the largest destination index considered.
cts <- 500
# One Poisson rate per census tract.
i <- rgamma(cts, 50, 1)
prop <- seq_len(cts)
# Home tract of 500,000 agents, drawn with replacement. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
agents <- sample(seq_len(cts), size = 500000, replace = TRUE)
# using a list structure to speed up the creation of cdfs
ls_cdf <- lapply(i, function(rate) cumsum(dpois(prop, rate)))
# Matrix version of the same CDFs, as consumed by Implementation 1
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))
mat_cdf <- matrix(NA_real_, cts, cts)
# The loop index is not named `i`, so the rate vector `i` survives the loop.
for (ct in seq_len(cts)) {
  # Running total of row ct gives that tract's CDF in a single pass
  mat_cdf[ct, ] <- cumsum(mat[ct, ])
}
# Implementation 1
# Draws a destination census tract for every agent from a matrix of CDFs.
#   agent_cts:  home-tract index of each agent (selects a row of ct_mat_cdf)
#   ct_mat_cdf: matrix with one row of cumulative probabilities per tract
# Returns one column index per agent.
GetCTMove <- function(agent_cts, ct_mat_cdf) {
  # drop = FALSE keeps a matrix even for a single agent; otherwise the lone
  # row collapses to a vector and max.col() sees a one-column matrix.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]
  # One uniform draw per agent, recycled so that row k is compared with s[k]
  s <- runif(length(agent_cts))
  # Position of the first TRUE per row = first cumulative probability above
  # the draw.
  # NOTE(review): an all-FALSE row should be reported as column 1 -- confirm
  # that is acceptable for rows that do not reach 1.
  max.col(s < mat_expand, "first")
}
# Implementation 2
# List-based sampler: for each agent, look up the CDF vector of the home tract
# and return the position of the first entry above a fresh uniform draw.
#   agent_cts: home-tract index of each agent (selects an element of ls_cdf)
#   ls_cdf:    list of cumulative-probability vectors
# Returns an integer vector with one destination index per agent.
GetCTMove2 <- function(agent_cts, ls_cdf) {
  n_agents <- length(agent_cts)
  draws <- runif(n_agents)
  dest <- integer(n_agents)
  for (k in seq_len(n_agents)) {
    home_cdf <- ls_cdf[[agent_cts[[k]]]]
    # The [[1L]] extraction fails if no entry is above the draw.
    dest[[k]] <- which(draws[[k]] < home_cdf)[[1L]]
  }
  dest
}
# Implementation 3
# Compile a C++ helper with Rcpp. fast_GetCTMove(agents, s, cdfs) returns, for
# every agent k, the 1-based position of the first element of
# cdfs[[agents[k]]] that is greater than s[k]; the entry stays 0 when no
# element qualifies. Invalid input (fewer draws than agents, or an agent value
# that is not a valid position in cdfs) raises an R error rather than reading
# outside the vectors.
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
  int n = agents.size();
  if (s.size() < n) {
    stop("s must supply one draw per agent");
  }
  int n_cdfs = cdfs.size();
  // Zero-filled on construction: 0 marks agents for which no entry exceeded the draw.
  NumericVector out(n);
  for (int i = 0; i < n; ++i) {
    // Negated range test so that NA/NaN values are rejected as well.
    if (!(agents[i] >= 1 && agents[i] <= n_cdfs)) {
      stop("agents must hold valid 1-based positions in cdfs");
    }
    NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]);
    int m = cdf.size();
    for (int j = 0; j < m; ++j) {
      if (s[i] < cdf[j]) {
        // Convert the 0-based C++ position to a 1-based R index.
        out[i] = j + 1;
        break;
      }
    }
  }
  return out;
}')
# R-side entry point for the compiled sampler: generates one uniform draw per
# agent, then delegates the lookup to fast_GetCTMove().
#   agent_cts: home-tract index of each agent
#   ls_cdf:    list of cumulative-probability vectors
# Returns whatever fast_GetCTMove() returns for these inputs.
GetCTMove3 <- function(agent_cts, ls_cdf) {
  n_agents <- length(agent_cts)
  draws <- runif(n_agents)
  fast_GetCTMove(agent_cts, draws, ls_cdf)
}
# Time a single run of each implementation on the same agents: GetCTMove is
# given the CDF matrix, GetCTMove2 and GetCTMove3 the list of CDF vectors.
system.time(GetCTMove(agents, mat_cdf))
system.time(GetCTMove2(agents, ls_cdf))
system.time(GetCTMove3(agents, ls_cdf))