我正在使用 TensorFlow 编写手写文本预测的代码。我首先针对用于数字预测的 MNIST 数据集,创建了一个适合此任务的网络。但是,当我调整代码以识别字母时,损失变成了 NaN。
你能帮我解决这个问题吗?
# Width of each of the three hidden layers, and the number of output classes.
n_node_hl1 = 100
n_node_hl2 = 100
n_node_hl3 = 100
n_classes = 62


def _layer_params(n_inputs, n_outputs):
    """Build one layer's parameters as a dict with 'weight' and 'bias' entries.

    Both entries are tf.Variable objects initialised from tf.random_normal;
    the weight is created before the bias.
    """
    weight = tf.Variable(tf.random_normal([n_inputs, n_outputs]))
    bias = tf.Variable(tf.random_normal([n_outputs]))
    return {'weight': weight, 'bias': bias}


# The first layer takes 4096 input features; each later layer takes the
# previous layer's width as its input size.
hidden_layer_1 = _layer_params(4096, n_node_hl1)
hidden_layer_2 = _layer_params(n_node_hl1, n_node_hl2)
hidden_layer_3 = _layer_params(n_node_hl2, n_node_hl3)
output = _layer_params(n_node_hl3, n_classes)
def neural_network_model(data):
    """Run ``data`` through three ReLU hidden layers and a linear output layer.

    Each hidden layer computes relu(data @ weight + bias) using the
    module-level ``hidden_layer_*`` dicts. The returned tensor is the final
    matmul-plus-bias using ``output``; no activation (such as softmax) is
    applied to it.
    """
    activations = data
    for layer in (hidden_layer_1, hidden_layer_2, hidden_layer_3):
        pre_activation = tf.add(tf.matmul(activations, layer['weight']), layer['bias'])
        activations = tf.nn.relu(pre_activation)
    return tf.matmul(activations, output['weight']) + output['bias']
# Training script: builds the loss/optimiser/accuracy ops, runs full-batch
# gradient descent and saves a checkpoint.
# NOTE(review): leading indentation was lost in this listing, so the nesting of
# the `with` / `for` / `if` bodies below has to be inferred by the reader.
# TRAIN_SIZE is defined outside this snippet.
# NOTE(review): the x_test / y_test split is also produced by TRAIN_SIZE --
# confirm this is intended rather than a separate test-set loader.
x_train, y_train = TRAIN_SIZE(2850)
x_test, y_test = TRAIN_SIZE(1900)
# Placeholders: x receives rows of 4096 floats, y_ receives rows of 62 floats.
with tf.name_scope("MNIST_Input"):
x = tf.placeholder(tf.float32, shape=[None, 4096], name='x')
y_ = tf.placeholder(tf.float32, shape=[None, 62], name='y')
# y is whatever neural_network_model returns for x; this script applies no
# softmax to it.
y = neural_network_model(x)
with tf.Session() as sess:
LEARNING_RATE = 0.2
TRAIN_STEPS = 200
sess.run(tf.global_variables_initializer())
# NOTE(review): tf.log is applied directly to the network output, which is not
# constrained to be positive, so the log can be NaN/-inf. This line also calls
# neural_network_model(x) a second time instead of reusing y, which adds a
# second copy of the forward ops to the graph.
with tf.name_scope("cross_entropy"):
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(neural_network_model(x)), reduction_indices=[1]))
# Plain gradient descent on the loss above.
with tf.name_scope("loss_optimiser"):
training = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(cross_entropy)
# Accuracy: fraction of rows where the argmax of y matches the argmax of y_.
with tf.name_scope("accuracy"):
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# NOTE(review): logpath is not defined anywhere in this snippet.
tbWritwr = tf.summary.FileWriter(logpath, sess.graph)
# Each step feeds the whole training set; every 10th step prints the accuracy
# on the test arrays and the loss on the training arrays.
for i in range(TRAIN_STEPS+1):
sess.run(training, feed_dict={x: x_train, y_: y_train})
if i%10 == 0:
print('Training Step:' + str(i) + ' Accuracy = ' + str(sess.run(accuracy, feed_dict={x: x_test, y_: y_test})) + ' Loss = ' + str(sess.run(cross_entropy, {x: x_train, y_: y_train})))
# print('Training Step:' + str(i) + ' Loss = ' + str(sess.run(cross_entropy, {x: x_train, y_: y_train})))
# Write the session's variables to a checkpoint and report the returned path.
savedPath = tf.train.Saver().save(sess, "/tmp/model.ckpt")
print("Model saved at: " ,savedPath)
我得到的输出是:
Total Training Images in Dataset = (2852, 4096)
--------------------------------------------------
x_train Examples Loaded = (2850, 4096)
y_train Examples Loaded = (2850, 62)
Total Training Images in Dataset = (2852, 4096)
--------------------------------------------------
x_train Examples Loaded = (1900, 4096)
y_train Examples Loaded = (1900, 62)
TensorFlow binary was not compiled to use: AVX AVX2
Total testing Images in Dataset = (558, 4096)
--------------------------------------------------
x_test Examples Loaded = (400, 4096)
y_test Examples Loaded = (400, 62)
Training Step:0 Accuracy = 0.02 Loss = nan
Training Step:10 Accuracy = 0.02 Loss = nan
Training Step:20 Accuracy = 0.03 Loss = nan
Training Step:30 Accuracy = 0.03 Loss = nan
Training Step:40 Accuracy = 0.03 Loss = nan
Training Step:50 Accuracy = 0.03 Loss = nan
Training Step:60 Accuracy = 0.03 Loss = nan
Training Step:70 Accuracy = 0.03 Loss = nan
Training Step:80 Accuracy = 0.03 Loss = nan
Training Step:90 Accuracy = 0.03 Loss = nan
Training Step:100 Accuracy = 0.03 Loss = nan
Training Step:110 Accuracy = 0.03 Loss = nan
Training Step:120 Accuracy = 0.03 Loss = nan
Training Step:130 Accuracy = 0.03 Loss = nan
Training Step:140 Accuracy = 0.03 Loss = nan
Training Step:150 Accuracy = 0.03 Loss = nan
Training Step:160 Accuracy = 0.03 Loss = nan
Training Step:170 Accuracy = 0.03 Loss = nan
Training Step:180 Accuracy = 0.03 Loss = nan
Training Step:190 Accuracy = 0.03 Loss = nan
Training Step:200 Accuracy = 0.0275 Loss = nan
你的交叉熵公式是错误的。neural_network_model(x) 返回的并不是 62 个类别上的概率分布。
你可以使用 tf.nn.softmax 来得到概率分布:
它通过指数函数使所有值为正,并进行归一化使其总和为 1。
# Raw network output for x, before any normalisation.
logits = neural_network_model(x)
# Pass the logits through softmax so the values fed to tf.log are positive.
predicted_probabilities = tf.nn.softmax(logits)
# tf.reduce_mean of the per-row sums (axis 1) of -y_ * log(predicted probability).
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(predicted_probabilities), reduction_indices=[1]))
这种写法存在潜在的数值稳定性问题:tf.nn.softmax 内部先使用指数函数,
随后在计算交叉熵时又取对数。更好的做法是使用 TensorFlow 内置的 softmax 交叉熵函数(例如 tf.nn.softmax_cross_entropy_with_logits)。
@Aaron 已经很好地说明了该损失在数值意义上的错误,以及为什么您应该考虑使用 TensorFlow 原生的损失函数,例如 tf.losses.sparse_softmax_cross_entropy()。

我这里只关注一个问题:为什么是 NaN?
可能原因:
由于权重和偏置变量可能取负值,网络输出 out = tf.matmul(l3, output['weight'])+ output['bias']
也可能为负。
此时 tf.log(neural_network_model(x)) 将输出 NaN
(因为对数在零和负值处没有定义),
您的 cross_entropy 损失也会随之变为 NaN。
确保您的网络输出为正
(例如,再添加一个 relu 或 softmax,
并在 tf.log() 内部加上一个小的正数 epsilon,
例如 tf.log(out + 1e-8)),
可能会解决您的问题。
演示:
为了说明我的观点(即 y 可能为负,
从而导致 tf.log(y) == NaN),
下面是对您的代码稍作修改后的版本。除了一些优化和模拟输入数据之外,网络架构与您的完全相同。我只修改了最后一层权重/偏置的初始化(用了一种很粗糙的方式……),以确保 y >= 0
:
import tensorflow as tf
import numpy as np
# All three hidden layers share the same width.
n_node_hl1 = n_node_hl2 = n_node_hl3 = 100
n_classes = 62

# Each layer is a dict with a 'weight' matrix and a 'bias' vector, both
# initialised from tf.random_normal (weight created first, then bias).
hidden_layer_1 = dict(
    weight=tf.Variable(tf.random_normal([4096, n_node_hl1])),
    bias=tf.Variable(tf.random_normal([n_node_hl1])),
)
hidden_layer_2 = dict(
    weight=tf.Variable(tf.random_normal([n_node_hl1, n_node_hl2])),
    bias=tf.Variable(tf.random_normal([n_node_hl2])),
)
hidden_layer_3 = dict(
    weight=tf.Variable(tf.random_normal([n_node_hl2, n_node_hl3])),
    bias=tf.Variable(tf.random_normal([n_node_hl3])),
)

# >>> CHANGE: the output layer's initial values are forced non-negative with
# tf.abs() so that log(output) is not NaN.
# Note: you do NOT want to do this in real code; tf.abs() is here only to
# make the point.
output = dict(
    weight=tf.Variable(tf.abs(tf.random_normal([n_node_hl3, n_classes]))),
    bias=tf.Variable(tf.abs(tf.random_normal([n_classes]))),
)
def neural_network_model(data):
    """Apply three ReLU hidden layers followed by a linear output layer.

    Uses the module-level ``hidden_layer_1``..``hidden_layer_3`` and
    ``output`` parameter dicts. The value returned is the final
    matmul-plus-bias with no activation applied.
    """
    def _relu_layer(inputs, params):
        # relu(inputs @ weight + bias) for a single hidden layer.
        return tf.nn.relu(tf.add(tf.matmul(inputs, params['weight']), params['bias']))

    h1 = _relu_layer(data, hidden_layer_1)
    h2 = _relu_layer(h1, hidden_layer_2)
    h3 = _relu_layer(h2, hidden_layer_3)
    logits = tf.matmul(h3, output['weight']) + output['bias']
    return logits
# Demo training script on mocked data.
# NOTE(review): leading indentation was lost in this listing, so the nesting of
# the `with` / `for` / `if` bodies below has to be inferred by the reader.
# >>> CHANGE: MOCKING YOUR DATA:
# Features are uniform random values from np.random.rand; labels are one-hot
# rows selected from np.eye(n_classes) by random class indices.
x_train, y_train = np.random.rand(100, 4096), np.eye(n_classes)[np.random.randint(n_classes, size=100).reshape(-1)]
x_test, y_test = np.random.rand(50, 4096), np.eye(n_classes)[np.random.randint(n_classes, size=50).reshape(-1)]
# Placeholders: x receives rows of 4096 floats, y_ receives rows of 62 floats.
with tf.name_scope("MNIST_Input"):
x = tf.placeholder(tf.float32, shape=[None, 4096], name='x')
y_ = tf.placeholder(tf.float32, shape=[None, 62], name='y')
# y is whatever neural_network_model returns for x; this script applies no
# softmax to it.
y = neural_network_model(x)
with tf.Session() as sess:
LEARNING_RATE = 0.2
TRAIN_STEPS = 200
sess.run(tf.global_variables_initializer())
with tf.name_scope("cross_entropy"):
# >>> CHANGE: REUSING "y" instead of re-instantiating the operations in "neural_network_model(x)":
# tf.log is applied to y directly, so y must stay positive for the result to be
# finite.
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
# Plain gradient descent on the loss above.
with tf.name_scope("loss_optimiser"):
training = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(cross_entropy)
# Accuracy: fraction of rows where the argmax of y matches the argmax of y_.
with tf.name_scope("accuracy"):
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Each step feeds the whole mocked training set; every 10th step prints the
# accuracy on the test arrays and the loss on the training arrays.
for i in range(TRAIN_STEPS+1):
sess.run(training, feed_dict={x: x_train, y_: y_train})
if i%10 == 0:
print('Training Step:' + str(i) + ' Accuracy = ' + str(sess.run(accuracy, feed_dict={x: x_test, y_: y_test})) + ' Loss = ' + str(sess.run(cross_entropy, {x: x_train, y_: y_train})))
输出:
Training Step:0 Accuracy = 0.04 Loss = -10.7876
Training Step:10 Accuracy = 0.04 Loss = -10.9636
Training Step:20 Accuracy = 0.04 Loss = -11.1214
Training Step:30 Accuracy = 0.04 Loss = -11.2649
Training Step:40 Accuracy = 0.04 Loss = -11.3968
Training Step:50 Accuracy = 0.04 Loss = -11.5175
Training Step:60 Accuracy = 0.04 Loss = -11.6284
Training Step:70 Accuracy = 0.02 Loss = -11.7313
Training Step:80 Accuracy = 0.02 Loss = -11.8277
Training Step:90 Accuracy = 0.02 Loss = -11.9183
Training Step:100 Accuracy = 0.02 Loss = -12.0041
Training Step:110 Accuracy = 0.02 Loss = -12.0853
Training Step:120 Accuracy = 0.02 Loss = -12.1625
Training Step:130 Accuracy = 0.02 Loss = -12.2359
Training Step:140 Accuracy = 0.02 Loss = -12.3058
Training Step:150 Accuracy = 0.02 Loss = -12.3725
Training Step:160 Accuracy = 0.02 Loss = -12.4363
Training Step:170 Accuracy = 0.02 Loss = -12.4973
Training Step:180 Accuracy = 0.02 Loss = -12.5558
Training Step:190 Accuracy = 0.02 Loss = -12.6121
Training Step:200 Accuracy = 0.02 Loss = -12.6662
……不再出现 NaN。
