r - Why does OLS regression give the lowest MSE when there are outliers in the data?



I am working with several regression models (ordinary least squares, Huber regression, MM estimation, and ridge regression). I want to check which model is more robust against both outliers and multicollinearity.

However, when the data contain outliers and multicollinearity, OLS regression gives the lowest MSE compared with the other regression models.

Is there something wrong with my code?

R code:

library(MASS)
library(glmnet)
### Defining the helper functions ###
# Mean Squared Error measure: MSE #
mse=function(x){
mmm=rep(0,ncol(x))
for (i in 1:ncol(x)){
mmm[i]=mean((x[,i])^2)
}
return(mmm)
}
# Mean Absolute Deviation measure: MAD #
mad=function(x){
mmm=rep(0,ncol(x))
for (i in 1:ncol(x)){
mmm[i]=mean(abs(x[,i]))
}
return(mmm)
}
# mean of the results ##
mee=function(x){
mmm=rep(0,ncol(x))
for (i in 1:ncol(x)){
mmm[i]=mean((x[,i]))
}
return(mmm)
}
umar <- function(R,n,sig,p,po,py,fx,fy){
#' 'R' is the level of multicollinearity, between 0 and 1
#' 'n' is the sample size
#' 'sig' is the error variance
#' 'p' is the number of explanatory variables
#' 'po' is the percentage of outliers in the x direction
#' 'py' is the percentage of outliers in the y direction
#' 'fx' is the magnitude of outliers in the x direction
#' 'fy' is the magnitude of outliers in the y direction
#' 'RR' is the number of replications

RR=20      
set.seed(123)

OP2=NULL
OP3=NULL

# explanatory variables

x=matrix(0,nrow=n,ncol=p)
W <-matrix(rnorm(n*(p+1),mean=0,sd=1), n, p+1)  
for (i in 1:n){
for (j in 1:p){
x[i,j] <- sqrt(1-R^2)*W[i,j]+(R)*W[i,p+1];      # Introduce multicollinearity
}    
}

b=eigen(t(x)%*%x)$vec[,1]

#Invoking outlier
rep1=sample(1:n, size=po*n, replace=FALSE)
x[rep1,2]=fx*max(x[,2])+x[rep1,2]     # inject outliers in the x direction
for (i in 1:RR){
u=rnorm(n,0,sig)
y=x%*%b+u
rep2=sample(1:n, size=py*n, replace=FALSE)
y[rep2]=fy*max(y)+y[rep2]

dat=data.frame(y,x)
n=nrow(dat)

# K-fold Cross validation
#Create k equally sized folds

k=3 # number of folds
folds <- cut(seq(1,n),breaks=k,labels=FALSE)

mols=matrix(0,nrow= k);
mM=matrix(0,nrow= k);mMM=matrix(0,nrow= k);
mrls=matrix(0,nrow= k);mrm=matrix(0,nrow= k);mrmm=matrix(0,nrow= k);
mols2=matrix(0,nrow= k);
mM2=matrix(0,nrow= k);mMM2=matrix(0,nrow= k)
mrls2=matrix(0,nrow= k);mrm2=matrix(0,nrow= k);mrmm2=matrix(0,nrow= k);

#Perform 3-fold cross-validation

for(i in 1:k){
#Segment the data by fold using the which() function 
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- dat[testIndexes, ]
trainData <- dat[-testIndexes, ]
xtr=as.matrix(trainData[,-1])
ytr=trainData[,1]
xte=as.matrix(testData[,-1])
yte=testData[,1]

mest=rlm(ytr~xtr,psi=psi.huber,k2=1.345,maxit=1000)$coefficients  # Huber Regression 

mmest=rlm(ytr~xtr,method="MM",maxit = 1000)$coefficients  # MM Estimators 

ols=lm(ytr~xtr)$coefficients     # OLS Regression 

nxtr=model.matrix(~xtr)

ridge.fit.cv <- cv.glmnet(nxtr, ytr, alpha = 0, standardize = FALSE, intercept = TRUE)
ridge.fit.lambda <- ridge.fit.cv$lambda.1se

I=diag(1,ncol(nxtr))
ridols=solve(t(nxtr)%*%nxtr+ridge.fit.lambda*I)%*%(t(nxtr)%*%nxtr)%*%ols  # Ridge Regression 
mrls[i]=mean(yte-cbind(1,xte)%*%ridols)^2
ridM=solve(t(nxtr)%*%nxtr+ridge.fit.lambda*I)%*%(t(nxtr)%*%nxtr)%*%mest # Ridge Huber 
mrm[i]=mean(yte-cbind(1,xte)%*%ridM)^2
ridMM=solve(t(nxtr)%*%nxtr+ridge.fit.lambda*I)%*%(t(nxtr)%*%nxtr)%*%mmest # Ridge MM
mrmm[i]=mean(yte-cbind(1,xte)%*%ridMM)^2

mols[i]=mean(yte-cbind(1,xte)%*%ols)^2
mM[i]=mean(yte-cbind(1,xte)%*%mest)^2
mMM[i]=mean(yte-cbind(1,xte)%*%mmest)^2

mrls2[i]=mean(abs(yte-cbind(1,xte)%*%ridols))
mrm2[i]=mean(abs(yte-cbind(1,xte)%*%ridM))
mrmm2[i]=mean(abs(yte-cbind(1,xte)%*%ridMM))
mols2[i]=mean(abs(yte-cbind(1,xte)%*%ols))
mM2[i]=mean(abs(yte-cbind(1,xte)%*%mest))
mMM2[i]=mean(abs(yte-cbind(1,xte)%*%mmest))

}

res1=cbind(mols,mM,mMM,
mrls,mrm,mrmm)

res3=cbind(mols2,mM2,mMM2,
mrls2,mrm2,mrmm2)

op2=mse(res1)
OP2=cbind(OP2,op2)
op3=mad(res3)
OP3=cbind(OP3,op3)

}

MSE=mee(t(OP2))
MAD=mee(t(OP3))



nam=c("OLS","M","MM","Ridge-OLS","Ridge-M","Ridge-MM")

data.frame(nam,R,n,sig,p,po,py,fx,fy,MAD,MSE)
}

results=NULL
R=c(0.999)
n=c(100)
sig=c(5)
p=c(3)
po=c(0.2)
py=c(0.2)
fx=c(5)
fy=c(5)
for(i in 1:length(R)){
for(j in 1:length(n)){
for(k in 1:length(sig)){
for(l in 1:length(p)){
for(m in 1:length(po)){
for(nn in 1:length(py)){
for(o in 1:length(fx)){
for(pp in 1:length(fy)){
results=rbind(results,umar(R=R[i],n=n[j],sig=sig[k],p=p[l],
po=po[m],py=py[nn],fx=fx[o],fy=fy[pp]))
}
}
}
}
}
}
}
}
View(results)

I have not read your code in detail. But if you are fitting robust models, you should also evaluate them with robust measures; otherwise you will not achieve what you are aiming for.

I will try to show this with a simple example: a single dataset, no cross-validation. Suppose the last point of these random data is a huge outlier.

set.seed(1)
x=1:100
y=x+rnorm(100)
y[100]=1000

Now we fit OLS and estimate the MSE:

mean((predict(lm(y~x))-y)^2)
[1] 7779.713

and a robust linear model:

library(MASS)
mean((predict(rlm(y~x,method="MM"))-y)^2)
[1] 8099.502

As you can see, the robust model has a higher MSE than the ordinary OLS model. That is because the mean squared error is exactly what OLS minimizes, while the robust model optimizes a different cost/loss function. So it is no surprise that OLS returns the best result on that metric.
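
To make the difference concrete, here is a minimal sketch comparing the squared loss with the Huber loss, using the same tuning constant k = 1.345 that appears in the question's rlm call (the actual fit also involves scale estimation and iterative reweighting, so this is only an illustration):

sq_loss    <- function(r) r^2
huber_loss <- function(r, k = 1.345) ifelse(abs(r) <= k, 0.5*r^2, k*abs(r) - 0.5*k^2)

r <- c(0.5, 2, 100)   # a small, a moderate, and a huge residual
rbind(squared = sq_loss(r), huber = huber_loss(r))
# The huge residual dominates the squared loss but grows only linearly under the
# Huber loss, so a robust fit is pulled far less towards the outlier.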

As stated at the beginning, if you are doing robust optimization, you should use a robust measure. If you check the MdAE (median absolute error) of the two models, you will see that the robust model performs better (again, obviously, because that is what it targets).

> median(abs(predict(lm(y~x))-y))
[1] 13.57675
> median(abs(predict(rlm(y~x,method="MM"))-y))
[1] 0.6008375
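
For completeness, here is a small self-contained sketch that puts the two measures side by side on the same simulated data; in your own simulation the analogue would be to collect median(abs(...)) per fold alongside the MSE:

library(MASS)
set.seed(1)
x <- 1:100
y <- x + rnorm(100)
y[100] <- 1000                                   # huge outlier in y

fits <- list(OLS = lm(y ~ x), MM = rlm(y ~ x, method = "MM"))
sapply(fits, function(f) c(MSE  = mean((predict(f) - y)^2),
                           MdAE = median(abs(predict(f) - y))))
# OLS "wins" on MSE (its own objective); the MM fit wins on the robust MdAE.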
