在汇总输出中,交叉验证数据的MSE为0.1641124,但在详细的交叉验证度量汇总中为0.14977892。它们不是相同的指标吗?
# Load H2O and initialise the session.
library(h2o)
h <- h2o.init()

# Push iris into H2O and split it 70/30 with a fixed seed so the
# partition is repeatable.
data <- as.h2o(iris)
part <- h2o.splitFrame(data, ratios = 0.7, seed = 123)
train <- part[[1]]
test <- part[[2]]

# Fit a GLM on the training partition: column 1 is the response,
# columns 2-5 are the predictors, evaluated with 10-fold cross-validation.
m <- h2o.glm(
  x = 2:5,
  y = 1,
  training_frame = train,
  nfolds = 10,
  seed = 123
)

# Print the model summary (includes both cross-validation MSE figures).
summary(m)
#...
#H2ORegressionMetrics: glm
#** Reported on cross-validation data. **
#** 10-fold cross-validation on training data (Metrics computed for combined
#holdout predictions) **
#MSE: ***0.1641124***
#RMSE: 0.4051079
#...
#Cross-Validation Metrics Summary:
# mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid cv_6_valid cv_7_valid cv_8_valid cv_9_valid
#...
# mse ***0.14977892*** 0.053578787 0.14102486 0.14244498 0.05266633 0.19028585 0.043878503 0.12635022 0.13820939 0.15831167 0.33359975
这两个MSE值的计算方式不同。
第一个(0.1641124)是把交叉验证期间所有保留集(holdout)上的预测合并在一起后计算得到的:
创建模型:
# Refit the same 10-fold GLM, this time asking H2O to retain the holdout
# predictions and the per-row fold assignment so both MSE figures can be
# recomputed by hand.
m <- h2o.glm(
  x = 2:5,
  y = 1,
  training_frame = train,
  nfolds = 10,
  seed = 123,
  keep_cross_validation_predictions = TRUE,
  keep_cross_validation_fold_assignment = TRUE
)
提取保留集(holdout)预测:
# Pull the cross-validation holdout predictions out of H2O into a local
# data frame.
preds <- as.data.frame(
  h2o.cross_validation_holdout_predictions(m)
)
计算MSE:
# MSE over all holdout predictions taken together (one mean across every
# training row), compared against the observed Sepal.Length.
mean((preds[["predict"]] - as.data.frame(train)[["Sepal.Length"]])^2)
#output
0.1641125
而较低的MSE(0.14977892)则是先分别计算每个保留集(每一折)的MSE,再对这些MSE取平均得到的:
# Fold id that each training row was assigned to during cross-validation.
folds <- as.data.frame(h2o.cross_validation_fold_assignment(m))
library(tidyverse)

# Line up prediction, fold id and observed value per row; compute the MSE
# inside each fold, then take the plain (unweighted) mean of those fold MSEs.
data.frame(
  preds = preds$predict,
  folds = folds$fold_assignment,
  true = as.data.frame(train)$Sepal.Length
) %>%
  group_by(folds) %>%
  summarise(mse = mean((preds - true)^2)) %>%
  ungroup() %>%
  summarise(mse = mean(mse)) %>%
  as.numeric()
#output
0.1497789
再现第一次运行:
# Load H2O and initialise the session.
library(h2o)
h <- h2o.init()

# Push iris into H2O and split it 70/30 with a fixed seed so the
# partition is repeatable.
data <- as.h2o(iris)
part <- h2o.splitFrame(data, ratios = 0.7, seed = 123)
train <- part[[1]]
test <- part[[2]]