Tensorflow: GradientTape returns NaN gradients



I have been trying to train a custom style transfer network with AdaIN. The only problem I am facing now is that the gradients I get are all NaN values, right from the first epoch. Currently using TF 2.6.1.
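To confirm which weights actually receive NaN (or missing) gradients, a quick check can be run right after tape.gradient. This is a minimal sketch, assuming gradients and weights come from the training loop shown below:

import tensorflow as tf

# Sketch: report which weights get NaN/Inf (or no) gradients.
# `gradients` and `weights` are assumed to come from the loop below,
# i.e. gradients = tape.gradient(loss, self.model.trainable_weights).
def report_bad_gradients(gradients, weights):
    for grad, weight in zip(gradients, weights):
        if grad is None:
            print(f'{weight.name}: no gradient (not connected to the loss)')
        elif not bool(tf.reduce_all(tf.math.is_finite(grad))):
            print(f'{weight.name}: NaN/Inf gradient')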

Here is the custom training loop and loss function:

def _compute_mean_std(self, feats : tf.Tensor, eps=1e-8):
    """
    feats: Features should be in shape N x H x W x C
    """
    mean = tf.math.reduce_mean(feats, axis=[1,2], keepdims=True)
    std = tf.math.reduce_std(feats, axis=[1,2], keepdims=True) + eps
    return mean, std

def criterion(self, stylized_img : tf.Tensor, style_img : tf.Tensor, t : tf.Tensor):
    stylized_content_feats = self.model.encode(stylized_img)
    stylized_feats = self.model.encode(stylized_img, return_all=True)
    style_feats = self.model.encode(style_img, return_all=True)

    content_loss = self.mse_loss(t, stylized_content_feats)

    style_loss = 0
    for f1, f2 in zip(stylized_feats, style_feats):
        m1, s1 = self._compute_mean_std(f1)
        m2, s2 = self._compute_mean_std(f2)
        style_loss += self.mse_loss(m1, m2) + self.mse_loss(s1, s2)

    return content_loss + self.style_weight * style_loss

def train(self):
    step = 0
    while step < self.num_iter:
        content_batch = self.content_iter.get_next()
        if content_batch.shape[0] != self.batch_size:
            content_batch = self.content_iter.get_next()

        style_batch = self.style_iter.get_next()
        if style_batch.shape[0] != self.batch_size:
            style_batch = self.style_iter.get_next()

        with tf.GradientTape() as tape:
            stylized_imgs, t = self.model(dict(content_imgs=content_batch, style_imgs=style_batch, alpha=1.0))
            loss = self.criterion(stylized_imgs, style_batch, t)

        gradients = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_weights))

        # log and save every 200 batches
        if step % 200 == 0:
            print(f'Training loss (for one batch) at step {step}: {loss}')
            print(f'Seen so far: {(step+1)*self.batch_size} samples')
            self.model.save_weights(f'./checkpoints/adain_e{step}.ckpt')

        step += 1

    print("Finished training...")
    self.model.save_weights('saved_model/adain_weights.h5')

I can't figure out why it does this. It doesn't throw an error when _compute_mean_std computes the mean/std in N x C x H x W format, which is not what I want either. Adding a transpose so it is computed over the correct axes also triggers the problem.
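For what it's worth, one frequent culprit in exactly this pattern is tf.math.reduce_std itself: its gradient divides by the computed std, so any feature map with (near-)zero spatial variance produces Inf/NaN in the backward pass, and adding eps *after* the std does not protect the gradient. A sketch of a more stable variant (same interface as _compute_mean_std above, using tf.nn.moments):

def _compute_mean_std(self, feats: tf.Tensor, eps=1e-8):
    """
    feats: Features should be in shape N x H x W x C
    """
    mean, var = tf.nn.moments(feats, axes=[1, 2], keepdims=True)
    # eps goes inside the sqrt, so the gradient stays finite
    # even when the variance is exactly zero
    std = tf.math.sqrt(var + eps)
    return mean, std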

It is possible.

[Sample] (a minimal reference pattern follows the sample):

import os
from os.path import exists
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
None
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
config = tf.config.experimental.set_memory_growth(physical_devices[0], True)
print(physical_devices)
print(config)
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Variables
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
num_iter = 1000
train_generator_batch_size = 1
batch_size = 1
WIDTH = 256
HEIGHT = 256
CHANNEL = 3
checkpoint_path = "F:\models\checkpoint\" + os.path.basename(__file__).split('.')[0] + "\TF_DataSets_01.h5"
checkpoint_dir = os.path.dirname(checkpoint_path)
if not exists(checkpoint_dir) : 
os.mkdir(checkpoint_dir)
print("Create directory: " + checkpoint_dir)
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Definition / Class
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
def create_image_generator( ):
    variables = pd.read_excel(r'F:\temp\Python\excel\Book 7.xlsx', index_col=None, header=[0], dtype=str)

    train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        validation_split=0.2,
    )

    train_image_ds = train_generator.flow_from_dataframe(
        dataframe=variables,
        directory=None,
        x_col='Image',
        y_col='Label',
        weight_col=None,
        target_size=( WIDTH, HEIGHT ),
        color_mode='rgb',
        classes=None,
        class_mode='categorical',
        batch_size=train_generator_batch_size,
        shuffle=True,
        seed=None,
        save_to_dir=None,
        save_prefix='',
        save_format='png',
        subset=None,
        interpolation='nearest',
        validate_filenames=True,
    )

    return train_image_ds
class gradient_tape_optimizer( ):
    def __init__ ( self, model, num_iter, content_iter, batch_size ):
        self.num_iter = num_iter
        self.content_iter = content_iter
        self.style_iter = content_iter
        self.batch_size = batch_size
        self.model = model
        # the model's final Dense(2) has no activation, so its outputs are logits
        self.loss = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True,
                        reduction=tf.keras.losses.Reduction.AUTO,
                        name='sparse_categorical_crossentropy' )
        self.optimizer = tf.keras.optimizers.Nadam( learning_rate=0.00001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, name='Nadam' )
    # carried over from the question; these reference self.mse_loss and
    # self.style_weight, which are not defined here, and are never called in train()
    def _compute_mean_std( self, feats : tf.Tensor, eps=1e-8 ):
        """
        feats: Features should be in shape N x H x W x C
        """
        mean = tf.math.reduce_mean(feats, axis=[1,2], keepdims=True)
        std = tf.math.reduce_std(feats, axis=[1,2], keepdims=True) + eps
        return mean, std

    def criterion( self, stylized_img : tf.Tensor, style_img : tf.Tensor, t : tf.Tensor ):
        stylized_content_feats = self.model.encode(stylized_img)
        stylized_feats = self.model.encode(stylized_img, return_all=True)
        style_feats = self.model.encode(style_img, return_all=True)

        content_loss = self.mse_loss(t, stylized_content_feats)

        style_loss = 0
        for f1, f2 in zip(stylized_feats, style_feats):
            m1, s1 = self._compute_mean_std(f1)
            m2, s2 = self._compute_mean_std(f2)
            style_loss += self.mse_loss(m1, m2) + self.mse_loss(s1, s2)

        return content_loss + self.style_weight * style_loss
    def train( self ):
        step = 0
        while step < self.num_iter:
            content_batch = self.content_iter.get_next()
            if content_batch[0].shape[1] != self.batch_size:
                content_batch = self.content_iter.get_next()

            style_batch = self.style_iter.get_next()
            if style_batch[0].shape[1] != self.batch_size:
                style_batch = self.style_iter.get_next()

            # the generator yields one-hot labels of shape (1, 1, 2);
            # SparseCategoricalCrossentropy wants class indices of shape (1,)
            current_label = tf.math.argmax( tf.reshape( content_batch[1], ( 1, 2 ) ), axis=-1 )

            with tf.GradientTape() as tape:
                result = self.model( tf.reshape( content_batch[0], ( 1, WIDTH, HEIGHT, CHANNEL ) ) )
                # arguments are (y_true, y_pred), and nothing here may leave the
                # tape via .numpy(), or the gradients come back None/NaN
                loss_value = self.loss( current_label, result )

            # differentiate the loss w.r.t. the weights, not w.r.t. itself
            gradients = tape.gradient( loss_value, self.model.trainable_weights )
            self.optimizer.apply_gradients( zip( gradients, self.model.trainable_weights ) )

            # log and save every 200 batches
            if step % 200 == 0:
                print(f'Training loss (for one batch) at step {step}: {loss_value}')
                print(f'Seen so far: {(step+1)*self.batch_size} samples')
                self.model.save_weights(checkpoint_path)

            step += 1

        print("Finished training...")
        self.model.save_weights(checkpoint_path)
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Dataset
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
variables = pd.read_excel(r'F:\temp\Python\excel\Book 7.xlsx', index_col=None, header=[0], dtype=str)

train_image_ds = tf.data.Dataset.from_generator(
    create_image_generator,
    output_signature=(
        tf.TensorSpec(shape=( 1, WIDTH, HEIGHT, CHANNEL ), dtype=tf.float32, name=None),
        tf.TensorSpec(shape=( 1, 2 ), dtype=tf.float32, name=None),
    ),
    name='train_image_ds'
)

train_image_ds = train_image_ds.batch( 1 )
iterator = iter( train_image_ds )
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Model Initialize
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
model = tf.keras.models.Sequential([
    tf.keras.layers.InputLayer(input_shape=( WIDTH, HEIGHT, CHANNEL )),
    tf.keras.layers.Normalization(mean=3., variance=2.),
    tf.keras.layers.Normalization(mean=4., variance=6.),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Reshape((128, 127 * 127)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(96, return_sequences=True, return_state=False)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(96)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(192, activation='relu'),
    tf.keras.layers.Dense(2),
])
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Optimizer
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
optimizer = tf.keras.optimizers.Nadam(
    learning_rate=0.00001, beta_1=0.9, beta_2=0.999, epsilon=0.0000001,
    name='Nadam'
)
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Loss Fn
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""                               
lossfn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,    # the model's Dense(2) outputs raw logits
    reduction=tf.keras.losses.Reduction.AUTO,
    name='sparse_categorical_crossentropy'
)
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Model Summary
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
model.compile(optimizer=optimizer, loss=lossfn, metrics=['accuracy'])
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Training
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
gradient_tape_optimizer = gradient_tape_optimizer( model, num_iter, iterator, batch_size )
result = gradient_tape_optimizer.train()
input( '...' )
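Regardless of the surrounding setup, the part that decides whether the gradients are usable is the taped section. For comparison, here is the canonical pattern as a minimal sketch; model, x, y, loss_fn, and optimizer are hypothetical stand-ins:

with tf.GradientTape() as tape:
    pred = model(x, training=True)   # forward pass recorded by the tape
    loss = loss_fn(y, pred)          # order is (y_true, y_pred); no .numpy() here

# differentiate the loss w.r.t. the model weights...
grads = tape.gradient(loss, model.trainable_weights)
# ...and apply one update step
optimizer.apply_gradients(zip(grads, model.trainable_weights))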
