为什么 Chainer 的批量归一化(Batch Normalization)在 MNIST 神经网络上效果不佳



Chainer 的批量归一化在我的代码中效果不好,而 TensorFlow 的批量归一化却能正常工作。我使用的是 MNIST 数据集,加载方式见下面的代码。

使用 Chainer(版本 6.1.0)时,如果不使用批量归一化,100 个 epoch 后的验证精度在 0.97 到 0.98 之间;而使用批量归一化时,100 个 epoch 后的验证精度低于 0.80。

当我以相同的方式使用 TensorFlow(版本 1.14.0)时,无论是否使用批量归一化,验证精度都约为 0.98。

以下是我的部分代码。epoch 数为 100,批大小为 1000。我使用 Adam 作为优化器,学习率(learning_rate)为 0.01。

数据集、训练数据、验证数据

# Load MNIST as (images, labels) pairs for the training and test splits.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
# Flatten every image into a row of 28*28 values.
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
# Convert pixels to float32 and divide by 255; labels are cast to int32.
x_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
# The test split serves as the validation set.
x_val = test_images.astype('float32')/255
y_val = test_labels.astype('int32')

模型和条件(Chainer)

# Define model
class MyModel(Chain):
    """Fully connected classifier: two hidden layers with batch norm + ReLU.

    NOTE: a single ``BatchNormalization`` link (``self.bn``) is applied after
    both ``l1`` and ``l2``, so the two hidden layers share one set of
    batch-normalization parameters and running statistics.

    Args:
        n_in: Size of each input vector.
        n_hidden: Number of units in each hidden layer.
        n_out: Number of output units (one logit per class).
    """

    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            # NOTE(review): the complete listing further down uses decay=0.9.
            self.bn = L.BatchNormalization(n_hidden, decay=0.99, eps=0.001)

    def forward(self, x):
        """Return the output of ``l3`` (unnormalized logits) for batch ``x``."""
        h = F.relu(self.bn(self.l1(x)))
        h = F.relu(self.bn(self.l2(h)))
        return self.l3(h)


model = MyModel()
# NOTE(review): Adam is constructed with its defaults here, while the complete
# listing further down passes alpha=0.01.
optimizer = optimizers.Adam()
optimizer.setup(model)
n_epoch = 100
n_batchsize = 1000

模型和条件(TensorFlow)

# Network dimensions.
n_inputs = 28 * 28
n_hidden1 = 100
n_hidden2 = 100
n_outputs = 10

batch_norm_momentum = 0.9
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
# Defaults to False; feed True to run batch normalization in training mode.
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope("dnn"):
    he_init = tf.variance_scaling_initializer()
    # Pre-bind the arguments shared by every batch-norm / dense layer.
    my_batch_norm_layer = partial(
        tf.layers.batch_normalization,
        training=training,
        momentum=batch_norm_momentum)
    my_dense_layer = partial(
        tf.layers.dense,
        kernel_initializer=he_init)
    # my_batch_norm_layer is called three times below, once per dense layer.
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.relu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.relu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    # NOTE(review): the complete listing further down uses learning_rate=0.01.
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    # A prediction counts as correct when the label is the top-1 logit.
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

我在 Chainer 中使用的全部代码

import numpy as np
import chainer
from chainer import cuda, Function, gradient_check, report, training, utils, Variable
from chainer import datasets, iterators, optimizers, serializers
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions
from keras.datasets import mnist
import cupy as cp

# Load MNIST as (images, labels) pairs for the training and test splits.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
# Flatten every image into a row of 28*28 values.
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
# Convert pixels to float32 and divide by 255; labels are cast to int32.
x_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
# The test split serves as the validation set.
x_val = test_images.astype('float32')/255
y_val = test_labels.astype('int32')

# Define model
class MyModel(Chain):
    """Fully connected classifier: two hidden layers with batch norm + ReLU.

    NOTE: a single ``BatchNormalization`` link (``self.bn``) is applied after
    both ``l1`` and ``l2``, so the two hidden layers share one set of
    batch-normalization parameters and running statistics.

    Args:
        n_in: Size of each input vector.
        n_hidden: Number of units in each hidden layer.
        n_out: Number of output units (one logit per class).
    """

    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            self.bn = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)

    def forward(self, x):
        """Return the output of ``l3`` (unnormalized logits) for batch ``x``."""
        h = F.relu(self.bn(self.l1(x)))
        h = F.relu(self.bn(self.l2(h)))
        return self.l3(h)
# define optimizer
model = MyModel()
optimizer = optimizers.Adam(alpha=0.01)
optimizer.setup(model)
## learn network
n_epoch = 100
n_batchsize = 1000
iteration = 0
gpu_id = 0
cuda.get_device(gpu_id).use()
# send the network to gpu memory
model.to_gpu(gpu_id)
# Move the validation set to the GPU once; it does not change between epochs.
x_val = cp.asarray(x_val)
y_val = cp.asarray(y_val)
print("epoch  train/loss  val/loss  train/acc   val/acc")
for epoch in range(n_epoch):
    # order dataset randomly
    order = np.random.permutation(len(x_train))
    loss_list = []
    accuracy_list = []
    for i in range(0, len(order), n_batchsize):
        # Slice the shuffled indices to pick this mini-batch, then copy it to the GPU.
        index = order[i:i+n_batchsize]
        x_train_batch = cp.asarray(x_train[index, :])
        y_train_batch = cp.asarray(y_train[index])
        output_train = model(x_train_batch)
        loss_train_batch = F.softmax_cross_entropy(output_train, y_train_batch)
        accuracy_train_batch = F.accuracy(output_train, y_train_batch)
        # Keep per-batch metrics on the CPU so they can be averaged with NumPy.
        loss_list.append(cuda.to_cpu(loss_train_batch.array))
        accuracy_list.append(cuda.to_cpu(accuracy_train_batch.array))
        # Clear old gradients, backpropagate, and apply one optimizer step.
        model.cleargrads()
        loss_train_batch.backward()
        optimizer.update()
        iteration += 1
    loss_train = np.mean(loss_list)
    accuracy_train = np.mean(accuracy_list)
    # after one epoch, evaluate with validation data
    # ('train' = False switches the model to test mode; backprop is disabled).
    with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
        output_val = model(x_val)
        loss_val = F.softmax_cross_entropy(output_val, y_val)
        loss_val = cuda.to_cpu(loss_val.array)
        accuracy_val = F.accuracy(output_val, y_val)
        accuracy_val = cuda.to_cpu(accuracy_val.array)
    print('{0:>4d} {1:>10.4f} {2:>10.4f} {3:>10.4f} {4:>10.4f}'.format(epoch,loss_train,loss_val,accuracy_train,accuracy_val))

我在 TensorFlow 中使用的全部代码

python
import tensorflow as tf
from keras.datasets import mnist
from functools import partial
import numpy as np
def shuffle_batch(X, y, batch_size):
    """Yield shuffled ``(X_batch, y_batch)`` mini-batches covering the data once.

    The indices ``0 .. len(X) - 1`` are randomly permuted and split into
    ``len(X) // batch_size`` nearly equal groups, so batches may be slightly
    larger than ``batch_size`` when ``len(X)`` is not a multiple of it.

    Args:
        X: Array of samples, indexable by an integer index array.
        y: Array of labels aligned with ``X``.
        batch_size: Target number of samples per batch.

    Yields:
        Tuples ``(X_batch, y_batch)`` holding the same shuffled rows of
        ``X`` and ``y``.
    """
    n_samples = len(X)
    if n_samples == 0:
        # Nothing to iterate over; avoid yielding an empty batch.
        return
    rnd_idx = np.random.permutation(n_samples)
    # np.array_split rejects 0 sections, which is what len(X) < batch_size
    # would produce; fall back to a single batch holding every sample.
    n_batches = max(1, n_samples // batch_size)
    for batch_idx in np.array_split(rnd_idx, n_batches):
        yield X[batch_idx], y[batch_idx]
# Load MNIST as (images, labels) pairs for the training and test splits.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
# Flatten every image into a row of 28*28 values.
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
# Convert pixels to float32 and divide by 255; labels are cast to int32.
X_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
# The test split serves as the validation set.
X_valid = test_images.astype('float32')/255
y_valid = test_labels.astype('int32')
# Network dimensions.
n_inputs = 28 * 28
n_hidden1 = 100
n_hidden2 = 100
n_outputs = 10
batch_norm_momentum = 0.9
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
# Defaults to False; feed True to run batch normalization in training mode.
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope("dnn"):
    # NOTE(review): he_init is created but never passed to the dense layers
    # in this listing (my_dense_layer binds no arguments).
    he_init = tf.variance_scaling_initializer()
    # Pre-bind the arguments shared by every batch-norm layer.
    my_batch_norm_layer = partial(
        tf.layers.batch_normalization,
        training=training,
        momentum=batch_norm_momentum)
    my_dense_layer = partial(tf.layers.dense)
    # my_batch_norm_layer is called three times below, once per dense layer.
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.relu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.relu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

learning_rate = 0.01
with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    # A prediction counts as correct when the label is the top-1 logit.
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 100
batch_size = 1000
# Ops registered under UPDATE_OPS (presumably the batch-norm moving-average
# updates); they are run explicitly alongside the training op below.
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
print("epoch  train/loss  val/loss  train/acc   val/acc")
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        loss_list = []
        accuracy_list = []
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            # One optimization step with batch norm in training mode.
            sess.run([training_op, extra_update_ops],
                     feed_dict={training: True, X: X_batch, y: y_batch})
            # `training` is not fed here, so the placeholder's default applies.
            loss_batch = loss.eval(feed_dict={X: X_batch, y: y_batch})
            accuracy_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
            loss_list.append(loss_batch)
            accuracy_list.append(accuracy_batch)
        # After each epoch, evaluate on the full validation set.
        loss_val = loss.eval(feed_dict={X: X_valid, y: y_valid})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print('{0:>4d} {1:>10.4f} {2:>10.4f} {3:>10.4f} {4:>10.4f}'
              .format(epoch,np.mean(loss_list),loss_val,np.mean(accuracy_list),accuracy_val))

我原本预期在 Chainer 中使用批量归一化后验证精度会在 98% 左右,但实际不到 80%。是我使用 Chainer 批量归一化的方式有误,还是 Chainer 与 TensorFlow 的批量归一化在实现结构上差别很大?

要让各层使用各自独立的批量统计信息,模型定义必须类似于下面的代码(每个隐藏层使用单独的 BatchNormalization 链接);在我的环境中,这样定义后 100 个 epoch 后的验证准确率达到了 98%。

class MyModel(Chain):
    """Fully connected classifier with a separate batch-norm link per hidden layer.

    ``bn1`` follows ``l1`` and ``bn2`` follows ``l2``, so each hidden layer
    has its own batch-normalization link.

    Args:
        n_in: Size of each input vector.
        n_hidden: Number of units in each hidden layer.
        n_out: Number of output units (one logit per class).
    """

    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            self.bn1 = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)
            self.bn2 = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)

    def forward(self, x):
        """Return the output of ``l3`` (unnormalized logits) for batch ``x``."""
        h = F.relu(self.bn1(self.l1(x)))
        h = F.relu(self.bn2(self.l2(h)))
        return self.l3(h)

最新更新