I am using the example given by H2O to do ECG anomaly detection. When I try to calculate the MSE manually, I get a different result. To demonstrate the difference I use the last test case, but all 23 cases differ. The full code is attached:
Thanks, Eli.
suppressMessages(library(h2o))

localH2O = h2o.init(max_mem_size = '6g',  # use 6GB of RAM of *GB available
                    nthreads = -1)        # use all CPUs (8 on my personal computer :3)

# Download and import ECG train and test data into the H2O cluster
train_ecg <- h2o.importFile(path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_train.csv",
                            header = FALSE,
                            sep = ",")
test_ecg  <- h2o.importFile(path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_test.csv",
                            header = FALSE,
                            sep = ",")

# Train deep autoencoder learning model on "normal"
# training data, y ignored
anomaly_model <- h2o.deeplearning(x = names(train_ecg),
                                  training_frame = train_ecg,
                                  activation = "Tanh",
                                  autoencoder = TRUE,
                                  hidden = c(50, 20, 50),
                                  l1 = 1e-4,
                                  epochs = 100)
# Compute reconstruction error with the Anomaly
# detection app (MSE between output layer and input layer)
recon_error <- h2o.anomaly(anomaly_model, test_ecg)
# Pull reconstruction error data into R and
# plot to find outliers (last 3 heartbeats)
recon_error <- as.data.frame(recon_error)
recon_error
plot.ts(recon_error)
test_recon <- h2o.predict(anomaly_model, test_ecg)
t <- as.vector(test_ecg[23,])
r <- as.vector(test_recon[23,])
mse.23 <- sum((t-r)^2)/length(t)
mse.23
recon_error[23,]
> mse.23
[1] 2.607374
> recon_error[23,]
[1] 8.264768
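For completeness, the same manual check can be vectorized over all 23 test rows instead of only row 23. This is just a sketch of my manual calculation (it reuses anomaly_model, test_ecg and recon_error from the script above, and assumes the single column returned by h2o.anomaly() is the per-row reconstruction MSE); the two numbers disagree on every row, not just the last one.

# Sketch: manual row-wise MSE vs. h2o.anomaly() for all 23 test rows
test_recon <- h2o.predict(anomaly_model, test_ecg)   # reconstructed test signals
orig       <- as.matrix(test_ecg)                     # pull both frames into R
recon      <- as.matrix(test_recon)
manual_mse <- rowMeans((orig - recon)^2)              # per-row mean squared error
data.frame(manual = manual_mse, h2o = recon_error[, 1])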
This is not really an answer, but I did follow @Arno Candel's suggestion. I tried combining the test and training data and normalizing the result to 0-1. I then split the combined, normalized data back into test and training sets and ran the script from the OP. However, I still get a different MSE from the manual calculation. The MSEs also differ when I normalize the test and training data separately. Is there anything I can do to get the manual calculation right?
suppressMessages(library(purrr))
suppressMessages(library(dplyr))
suppressMessages(library(h2o))

localH2O = h2o.init(max_mem_size = '6g',  # use 6GB of RAM of *GB available
                    nthreads = -1)        # use all CPUs (8 on my personal computer :3)

# Download and import ECG train and test data into the H2O cluster
train_ecg <- h2o.importFile(path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_train.csv",
                            header = FALSE,
                            sep = ",")
test_ecg  <- h2o.importFile(path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_test.csv",
                            header = FALSE,
                            sep = ",")
### adding this section
# normalize data
train_ecg <- as.data.frame(train_ecg)
test_ecg <- as.data.frame(test_ecg)
dat <- rbind(train_ecg,test_ecg)
get_desc <- function(x) {
  map(x, ~list(
    min = min(.x),
    max = max(.x),
    mean = mean(.x),
    sd = sd(.x)
  ))
}

normalization_minmax <- function(x, desc) {
  map2_dfc(x, desc, ~(.x - .y$min) / (.y$max - .y$min))
}

desc <- dat %>%
  get_desc()

dat <- dat %>%
  normalization_minmax(desc)

train_ecg <- as.matrix(dat[1:20, ]) ; test_ecg <- as.matrix(dat[21:43, ])

# convert the normalized matrices back into H2OFrames so that
# h2o.deeplearning() / h2o.anomaly() below can consume them
train_ecg <- as.h2o(train_ecg)
test_ecg  <- as.h2o(test_ecg)
# Train deep autoencoder learning model on "normal"
# training data, y ignored
anomaly_model <- h2o.deeplearning(x = names(train_ecg),
                                  training_frame = train_ecg,
                                  activation = "Tanh",
                                  autoencoder = TRUE,
                                  hidden = c(50, 20, 50),
                                  l1 = 1e-4,
                                  epochs = 100)
# Compute reconstruction error with the Anomaly
# detection app (MSE between output layer and input layer)
recon_error <- h2o.anomaly(anomaly_model, test_ecg)
# Pull reconstruction error data into R and
# plot to find outliers (last 3 heartbeats)
recon_error <- as.data.frame(recon_error)
recon_error
plot.ts(recon_error)
test_recon <- h2o.predict(anomaly_model, test_ecg)
t <- as.vector(test_ecg[23,])
r <- as.vector(test_recon[23,])
mse.23 <- sum((t-r)^2)/length(t)
mse.23
recon_error[23,]
> mse.23
[1] 23.14947
> recon_error[23,]
[1] 8.076866
For autoencoders in H2O, the MSE math is done in the normalized space to avoid numerical scaling issues. For example, if you have categorical features or very large numbers, the neural-network autoencoder cannot operate on those values directly: it first applies dummy one-hot encoding and normalization of the numerical features, and only then performs forward/backward propagation and computes the reconstruction error (in the normalized and expanded space). For purely numerical data, you can manually divide each column by its range (max - min) first, and the results should match.
Here is a JUnit test that does this check explicitly (on that very dataset): https://github.com/h2oai/h2o-3/blob/master/h2o-algos/src/test/java/hex/deeplearning/DeepLearningAutoEncoderTest.java#L86-L104
You can also see https://0xdata.atlassian.net/browse/PUBDEV-2078 for more information.
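As a rough illustration of the last point (purely numerical data, no categoricals), the sketch below rescales each column of the raw signals by its range before computing the row-wise MSE. It assumes anomaly_model, train_ecg and test_ecg from the original, un-normalized script are still in scope, and it uses the training frame's per-column ranges as a stand-in for whatever statistics H2O keeps internally, so treat it as an approximation of the check rather than the exact internal code.

# Sketch: manual per-row MSE computed in range-scaled space, as suggested above.
# Column ranges (max - min) are taken from the training data; this is an
# assumption about which statistics H2O uses, not a copy of its internals.
train_df  <- as.data.frame(train_ecg)
test_df   <- as.data.frame(test_ecg)
col_range <- vapply(train_df, function(col) max(col) - min(col), numeric(1))

test_recon <- h2o.predict(anomaly_model, test_ecg)    # reconstructions in original units
recon_df   <- as.data.frame(test_recon)

# divide the original and reconstructed signals by the same per-column ranges
scaled_orig  <- sweep(as.matrix(test_df),  2, col_range, "/")
scaled_recon <- sweep(as.matrix(recon_df), 2, col_range, "/")

manual_mse_scaled <- rowMeans((scaled_orig - scaled_recon)^2)
cbind(manual = manual_mse_scaled,
      h2o    = as.data.frame(h2o.anomaly(anomaly_model, test_ecg))[, 1])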