一些背景:R
中的nlm
函数是使用牛顿方法的通用优化例程。为了优化函数,牛顿方法需要函数,以及函数的一阶和二阶导数(分别为梯度向量和黑森矩阵)。R
nlm
函数允许您指定对应于梯度和Hessian计算的R函数,或者可以保留这些未指定,并且基于数值导数(通过deriv
函数)提供数值解。通过提供计算梯度和Hessian的函数,可以找到更准确的解决方案,因此这是一个有用的功能。
我的问题是:nlm
函数较慢,并且在提供解析Hessian时经常无法在合理的时间内收敛。我猜这是底层代码中的某种错误,但我很高兴错了。有没有办法使nlm
更好地使用分析黑森矩阵?
示例:下面的R
代码使用逻辑回归示例演示了此问题,其中
log(Pr(Y=1)/Pr(Y=0)) = b0 + Xb
其中 X 是 N 维乘 p 的多元法线,b 是长度为 p 的系数向量。
library(mvtnorm)
# example demonstrating a problem with NLM
expit <- function(mu) {1/(1+exp(-mu))}
mk.logit.data <- function(N,p){
set.seed(1232)
U = matrix(runif(p*p), nrow=p, ncol=p)
S = 0.5*(U+t(U)) + p*diag(rep(1,p))
X = rmvnorm(N, mean = runif(p, -1, 1), sigma = S)
Design = cbind(rep(1, N), X)
beta = sort(sample(c(rep(0,p), runif(1))))
y = rbinom(N, 1, expit(Design%*%beta))
list(X=X,y=as.numeric(y),N=N,p=p)
}
# function to calculate gradient vector at given coefficient values
logistic_gr <- function(beta, y, x, min=TRUE){
mu = beta[1] + x %*% beta[-1]
p = length(beta)
n = length(y)
D = cbind(rep(1,n), x)
gri = matrix(nrow=n, ncol=p)
for(j in 1:p){
gri[,j] = D[,j]*(exp(-mu)*y-1+y)/(1+exp(-mu))
}
gr = apply(gri, 2, sum)
if(min) gr = -gr
gr
}
# function to calculate Hessian matrix at given coefficient values
logistic_hess <- function(beta, y, x, min=TRUE){
# allow to fail with NA, NaN, Inf values
mu = beta[1] + x %*% beta[-1]
p = length(beta)
n = length(y)
D = cbind(rep(1,n), x)
h = matrix(nrow=p, ncol=p)
for(j in 1:p){
for(k in 1:p){
h[j,k] = -sum(D[,j]*D[,k]*(exp(-mu))/(1+exp(-mu))^2)
}
}
if(min) h = -h
h
}
# function to calculate likelihood (up to a constant) at given coefficient values
logistic_ll <- function(beta, y,x, gr=FALSE, he=FALSE, min=TRUE){
mu = beta[1] + x %*% beta[-1]
lli = log(expit(mu))*y + log(1-expit(mu))*(1-y)
ll = sum(lli)
if(is.na(ll) | is.infinite(ll)) ll = -1e16
if(min) ll=-ll
# the below specification is required for using analytic gradient/Hessian in nlm function
if(gr) attr(ll, "gradient") <- logistic_gr(beta, y=y, x=x, min=min)
if(he) attr(ll, "hessian") <- logistic_hess(beta, y=y, x=x, min=min)
ll
}
第一个例子,p=3:
dat = mk.logit.data(N=100, p=3)
glm
函数估计值仅供参考。nlm
应该给出相同的答案,允许由于近似而导致的小误差。
(glm.sol <- glm(dat$y~dat$X, family=binomial()))$coefficients
> (Intercept) dat$X1 dat$X2 dat$X3
> 0.00981465 0.01068939 0.04417671 0.01625381
# works when correct analytic gradient is specified
(nlm.sol1 <- nlm(p=runif(dat$p+1), f=logistic_ll, gr=TRUE, y=dat$y, x=dat$X))$estimate
> [1] 0.009814547 0.010689396 0.044176627 0.016253966
# works, but less accurate when correct analytic hessian is specified (even though the routine notes convergence is probable)
(nlm.sol2 <- nlm(p=runif(dat$p+1), f=logistic_ll, gr=TRUE, he=TRUE, y=dat$y, x=dat$X, hessian = TRUE, check.analyticals=TRUE))$estimate
> [1] 0.009827701 0.010687278 0.044178416 0.016255630
但是当 p 变大时,问题变得很明显,这里是 10
dat = mk.logit.data(N=100, p=10)
同样,glm
解决方案以供参考。nlm
应该给出相同的答案,允许由于近似而出现小误差。
(glm.sol <- glm(dat$y~dat$X, family=binomial()))$coefficients
> (Intercept) dat$X1 dat$X2 dat$X3 dat$X4 dat$X5 dat$X6 dat$X7
> -0.07071882 -0.08670003 0.16436630 0.01130549 0.17302058 0.03821008 0.08836471 -0.16578959
> dat$X8 dat$X9 dat$X10
> -0.07515477 -0.08555075 0.29119963
# works when correct analytic gradient is specified
(nlm.sol1 <- nlm(p=runif(dat$p+1), f=logistic_ll, gr=TRUE, y=dat$y, x=dat$X))$estimate
> [1] -0.07071879 -0.08670005 0.16436632 0.01130550 0.17302057 0.03821009 0.08836472
> [8] -0.16578958 -0.07515478 -0.08555076 0.29119967
# fails to converge in 5000 iterations when correct analytic hessian is specified
(nlm.sol2 <- nlm(p=runif(dat$p+1), f=logistic_ll, gr=TRUE, he=TRUE, y=dat$y, x=dat$X, hessian = TRUE, iterlim=5000, check.analyticals=TRUE))$estimate
> [1] 0.31602065 -0.06185190 0.10775381 -0.16748897 0.05032156 0.34176104 0.02118631
> [8] -0.01833671 -0.20364929 0.63713991 0.18390489
编辑:我还应该补充一点,我已经通过多种不同的方法确认我有正确的黑森矩阵
我尝试了代码,但起初它似乎使用了与我在 CRAN 上找到的不同 rmvnorm。我在dae包中找到了一个rmvnorm,然后在mvtnorm包中找到了一个。后者是要使用的。
nlm() 在上述发布的时间进行了修补。我目前正在尝试验证补丁,现在它似乎工作正常。 请注意,我是许多 R 优化代码的作者,包括 optim() 中的 3/5。
纳什杰克在uottawa.ca
代码如下。
修订后的代码:
# example demonstrating a problem with NLM
expit <- function(mu) {1/(1+exp(-mu))}
mk.logit.data <- function(N,p){
set.seed(1232)
U = matrix(runif(p*p), nrow=p, ncol=p)
S = 0.5*(U+t(U)) + p*diag(rep(1,p))
X = rmvnorm(N, mean = runif(p, -1, 1), sigma = S)
Design = cbind(rep(1, N), X)
beta = sort(sample(c(rep(0,p), runif(1))))
y = rbinom(N, 1, expit(Design%*%beta))
list(X=X,y=as.numeric(y),N=N,p=p)
}
# function to calculate gradient vector at given coefficient values
logistic_gr <- function(beta, y, x, min=TRUE){
mu = beta[1] + x %*% beta[-1]
p = length(beta)
n = length(y)
D = cbind(rep(1,n), x)
gri = matrix(nrow=n, ncol=p)
for(j in 1:p){
gri[,j] = D[,j]*(exp(-mu)*y-1+y)/(1+exp(-mu))
}
gr = apply(gri, 2, sum)
if(min) gr = -gr
gr
}
# function to calculate Hessian matrix at given coefficient values
logistic_hess <- function(beta, y, x, min=TRUE){
# allow to fail with NA, NaN, Inf values
mu = beta[1] + x %*% beta[-1]
p = length(beta)
n = length(y)
D = cbind(rep(1,n), x)
h = matrix(nrow=p, ncol=p)
for(j in 1:p){
for(k in 1:p){
h[j,k] = -sum(D[,j]*D[,k]*(exp(-mu))/(1+exp(-mu))^2)
}
}
if(min) h = -h
h
}
# function to calculate likelihood (up to a constant) at given coefficient values
logistic_ll <- function(beta, y,x, gr=FALSE, he=FALSE, min=TRUE){
mu = beta[1] + x %*% beta[-1]
lli = log(expit(mu))*y + log(1-expit(mu))*(1-y)
ll = sum(lli)
if(is.na(ll) | is.infinite(ll)) ll = -1e16
if(min) ll=-ll
# the below specification is required for using analytic gradient/Hessian in nlm function
if(gr) attr(ll, "gradient") <- logistic_gr(beta, y=y, x=x, min=min)
if(he) attr(ll, "hessian") <- logistic_hess(beta, y=y, x=x, min=min)
ll
}
##!!!! NOTE: Must have this library loaded
library(mvtnorm)
dat = mk.logit.data(N=100, p=3)
(glm.sol <- glm(dat$y~dat$X, family=binomial()))$coefficients
# works when correct analytic gradient is specified
(nlm.sol1 <- nlm(p=runif(dat$p+1), f=logistic_ll, gr=TRUE, y=dat$y, x=dat$X))$estimate
# works, but less accurate when correct analytic hessian is specified (even though the routine notes convergence is probable)
(nlm.sol2 <- nlm(p=runif(dat$p+1), f=logistic_ll, gr=TRUE, he=TRUE, y=dat$y, x=dat$X, hessian = TRUE, check.analyticals=TRUE))$estimate
dat = mk.logit.data(N=100, p=10)
# Again, glm solution for reference. nlm should give the same answer, allowing for small errors due to approximation.
(glm.sol <- glm(dat$y~dat$X, family=binomial()))$coefficients
# works when correct analytic gradient is specified
(nlm.sol1 <- nlm(p=runif(dat$p+1), f=logistic_ll, gr=TRUE, y=dat$y, x=dat$X))$estimate
# fails to converge in 5000 iterations when correct analytic hessian is specified
(nlm.sol2 <- nlm(p=runif(dat$p+1), f=logistic_ll, gr=TRUE, he=TRUE, y=dat$y, x=dat$X, hessian = TRUE, iterlim=5000, check.analyticals=TRUE))$estimate