TensorFlow: cosine-difference objective function is constant throughout training

The example below is a simplified version of what I'm working on. I'm trying to find a neural network that minimizes a cosine distance. The reason I implement my own cosine-difference loss function rather than using TensorFlow's built-in one is that in the full version of my project it doesn't do exactly what I need (although in this simple version the two are equivalent).

I feed two orthogonal vectors (A and B) into the network. I'm trying to reduce the cosine distance between A and B. The network does this by minimizing a loss function that also includes a component to preserve the length of vector B as it is transformed. Ultimately, my output should be a vector with the same direction as vector A and the length of vector B.
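
For concreteness, the two quantities the loss combines can be written out directly in NumPy. This is just an illustrative sketch of the objective; the function names here are mine, not from the code below:

import numpy as np

def cosine_distance(a, b):
    # 1 - cos(theta) between two vectors; equals 1.0 when they are orthogonal
    return 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def length_difference(b, b_transformed):
    # zero when the transformation preserves the length of b
    return np.linalg.norm(b) - np.linalg.norm(b_transformed)

A = np.array([1.0, 0.0])
B = np.array([0.0, 2.0])
print(cosine_distance(A, B))  # 1.0 for these orthogonal vectors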

The problem I'm having is that the network outputs "vector_B_transformed" and this vector never changes. The loss function I built is also constant throughout training. I've tried initializing the weights in different ways, but that didn't help. I never put a ReLU on the final layer of my fully connected network, and I've tried ReLU activations on the hidden layers, but that doesn't seem to make any difference.

I crudely append the results to lists and print them to the terminal. I reduced the number of epochs to 200, but the same problem occurs when it is increased.

If anyone can help me I'd be very grateful, as I'm really stuck.

from __future__ import division
import math
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt  # needed for the quiver plot in train()
from tensorflow.python.framework import ops
# from utils import *
##### New Helper Functions
# weight and bias wrappers
def weight_variable(name, shape):
    """
    Create a weight variable with appropriate initialization
    :param name: weight name
    :param shape: weight shape
    :return: initialized weight variable
    """
    initer = tf.truncated_normal_initializer(stddev=0.01)
    return tf.get_variable('W_' + name,
                           dtype=tf.float32,
                           shape=shape,
                           initializer=initer)

def bias_variable(name, shape):
    """
    Create a bias variable with appropriate initialization
    :param name: bias variable name
    :param shape: bias variable shape
    :return: initialized bias variable
    """
    initial = tf.constant(0., shape=shape, dtype=tf.float32)
    return tf.get_variable('b_' + name,
                           dtype=tf.float32,
                           initializer=initial)

def fc_layer(x, num_units, name, use_relu=True):
    """
    Create a fully-connected layer
    :param x: input from previous layer
    :param num_units: number of hidden units in the fully-connected layer
    :param name: layer name
    :param use_relu: boolean to add ReLU non-linearity (or not)
    :return: The output array
    """
    in_dim = x.get_shape()[1]
    W = weight_variable(name, shape=[in_dim, num_units])
    b = bias_variable(name, [num_units])
    layer = tf.matmul(x, W)
    layer += b
    if use_relu:
        layer = tf.nn.relu(layer)
    return layer
## loss function
def cosine_distance_simple(A, B):
    normalize_A = tf.nn.l2_normalize(A, 1)
    normalize_B = tf.nn.l2_normalize(B, 1)
    distance_matrix = 1 - tf.matmul(normalize_A, normalize_B, transpose_b=True)
    distance_matrix = tf.diag_part(distance_matrix)
    distance = tf.reduce_sum(distance_matrix)
    return distance
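
# Note: the matmul above builds the full pairwise similarity matrix only to
# keep its diagonal. A row-wise dot product is equivalent and cheaper; this
# alternative is a sketch, not part of the original code:
# def cosine_distance_rowwise(A, B):
#     normalize_A = tf.nn.l2_normalize(A, 1)
#     normalize_B = tf.nn.l2_normalize(B, 1)
#     return tf.reduce_sum(1 - tf.reduce_sum(normalize_A * normalize_B, axis=1))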
def maintain_length(A, B):
    # difference in L2 norms; zero when the transformation preserves length
    return (tf.norm(A) - tf.norm(B))
# generator network without residual block
def generator(vector, reuse=False, name="generator"):
    with tf.variable_scope(name):
        if reuse:
            tf.get_variable_scope().reuse_variables()
        else:
            assert tf.get_variable_scope().reuse is False

        output_dimension = vector.shape[1]
        e1 = fc_layer(vector, 2, name='g_e1', use_relu=False)
        e2 = fc_layer(e1, 4, name='g_e2', use_relu=False)
        e3 = fc_layer(e2, 8, name='g_e3', use_relu=False)
        e4 = fc_layer(e3, 16, name='g_e4', use_relu=False)
        e5 = fc_layer(e4, 16, name='g_e5', use_relu=False)
        e6 = fc_layer(e5, 8, name='g_e6', use_relu=False)
        e7 = fc_layer(e6, 4, name='g_e7', use_relu=False)
        e8 = fc_layer(e7, output_dimension, name='g_e8', use_relu=False)

        return e8
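
# Note: with use_relu=False on every layer, this generator is a composition of
# purely affine maps, which collapses to a single affine transformation;
# enabling use_relu=True on the hidden layers would give the stack more
# expressive power.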
import os
import time
from glob import glob
from collections import namedtuple
from sklearn.model_selection import train_test_split
# from module import *
# from utils import *
class cosine_diff_test(object):
    def __init__(self, sess, args):
        # initialise tensorflow session
        self.sess = sess
        # data, test, train splits
        self.data_A = args.vA
        self.data_B = args.vB
        self.generator = generator
        # when an instance of cosine_diff_test is created, _build_model is called automatically
        self._build_model()

    def _build_model(self):
        #### INPUTS TO NETWORKS
        # placeholders for vectors
        self.vector_A = tf.placeholder(tf.float32,
                                       [None, 2],
                                       name='vector_A')
        self.vector_B = tf.placeholder(tf.float32,
                                       [None, 2],
                                       name='vector_B')
        # FCNN to determine the vector move required
        self.vector_B_ = self.generator(self.vector_B, False, name="generatorB")
        # minimise cos_dist between A and B_ while preserving B's length.
        # NB: the two terms are wrapped in parentheses so they form one
        # expression; without a line continuation the second term would be a
        # separate no-op statement and silently dropped from the loss.
        self.loss = (cosine_distance_simple(self.vector_A, self.vector_B_)
                     + maintain_length(self.vector_B, self.vector_B_))
        '''
        self.loss = abs_criterion(self.vector_A, self.vector_A_) \
                    + abs_criterion(self.vector_B, self.vector_B_)
        '''
        # trainable variables
        t_vars = tf.trainable_variables()
        # training variables for generator
        self.g_vars = [var for var in t_vars if 'generator' in var.name]


    def train(self, args):
        # placeholder for learning rate
        self.lr = tf.placeholder(tf.float32, None, name='learning_rate')
        # define optimizer
        self.optim = tf.train.AdamOptimizer(self.lr, beta1=args.beta1).minimize(self.loss, var_list=self.g_vars)

        # initialise global variables and run session
        init_op = tf.global_variables_initializer()
        self.sess.run(init_op)
        lr = args.lr
        # import data
        vecA = self.data_A.copy()
        vecB = self.data_B.copy()
        results_loss = []
        results_vector_B_transformed = []
        # iterate over the number of epochs defined
        for epoch in range(args.epoch):
            # update
            vector_B_transformed, _ = self.sess.run(
                [self.vector_B_, self.loss],
                feed_dict={self.vector_A: vecA,
                           self.vector_B: vecB,
                           self.lr: lr})
            results_loss.append(_)
            results_vector_B_transformed.append(vector_B_transformed)
        print(results_loss)
        print(results_vector_B_transformed)

        origin = args.orig
        print('plotting ...')
        plt.xlim((-0.5, 1.5))
        plt.ylim((-0.5, 2.5))
        plt.quiver(*origin, vecA, vector_B_transformed,
                   color=['r', 'b'], angles='xy', scale_units='xy', scale=1)

class Args():
    A_vec = np.array([1, 0]).reshape(1, -1)
    B_vec = np.array([0, 2]).reshape(1, -1)
    ori = np.array([0, 0]).reshape(1, -1)

    epoch = 200
    lr = 0.0002
    vA = A_vec
    vB = B_vec
    beta1 = 0.5
    orig = ori

args = Args()

# TRAIN
tf.reset_default_graph()
tfconfig = tf.ConfigProto(allow_soft_placement=True)
tfconfig.gpu_options.allow_growth = True
with tf.Session(config=tfconfig) as sess:
    model = cosine_diff_test(sess, args)
    model.train(args)

OK, so I found the problem, and in the end it was a simple mistake:

My optimizer wasn't included in my update step. The original run call and the corrected one:

# before: the optimizer op is never run, so the weights are never updated
vector_B_transformed, _ = self.sess.run(
    [self.vector_B_, self.loss],
    feed_dict={self.vector_A: vecA,
               self.vector_B: vecB,
               self.lr: lr})

# after: self.optim is included in the run call, so each step applies an update
vector_B_transformed, _, loss = self.sess.run(
    [self.vector_B_, self.optim, self.loss],
    feed_dict={self.vector_A: vecA,
               self.vector_B: vecB,
               self.lr: lr})

My code still isn't working as expected, but at least it's trying to optimize something now, so that's progress!
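
A quick sanity check for this class of bug is to confirm that a training step actually changes the generator's weights. A minimal sketch, assuming `model` and `sess` are set up as above and `model.optim` has already been created inside `train()`:

# Sketch: verify that running the optimizer changes the generator weights.
w = model.g_vars[0]  # first trainable generator variable
before = sess.run(w)
sess.run(model.optim, feed_dict={model.vector_A: args.vA,
                                 model.vector_B: args.vB,
                                 model.lr: args.lr})
after = sess.run(w)
print('weights changed:', not np.allclose(before, after))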
