How to get gradients in TF 2.2 Eager?


model.total_loss

has been deprecated in Eager, so the below no longer works - how do I get the gradients?


Works for TF 2.1 / 2.0:

import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

ipt = Input((16,))
out = Dense(16)(ipt)
model = Model(ipt, out)
model.compile('adam', 'mse')

x = y = np.random.randn(32, 16)
model.train_on_batch(x, y)

# Symbolic gradients of the total loss w.r.t. the trainable weights (Graph mode)
grad_tensors = model.optimizer.get_gradients(model.total_loss, model.trainable_weights)

Note: the replacement should be able to set the learning_phase flag and, preferably (not required), handle sample_weight. The above accomplished this via K.function(..., outputs=grad_tensors); a sketch of that pattern follows.
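For reference, a rough sketch of that K.function pattern (Graph mode only; assumes model, x, y, and grad_tensors from the snippet above, and relies on the private _feed_targets / _feed_sample_weights attributes):

# Build a callable that feeds inputs, targets, sample weights and the
# learning phase, and fetches the symbolic gradient tensors (Graph mode).
inputs = (model.inputs + model._feed_targets + model._feed_sample_weights
          + [K.learning_phase()])
grads_fn = K.function(inputs, grad_tensors)

sample_weight = np.ones(len(x))                  # uniform weighting
gradients = grads_fn([x, y, sample_weight, 0])   # learning_phase=0 (test mode)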

The internals changed in 2.2, making some model attributes and methods inaccessible. The below works for both Graph and Eager, and is tested to give reproducible results. The Eager case only covers trainable weights, not layer outputs; I'll soon add a more complete version, covering outputs, to See RNN.

The Eager method reuses the Eager train loop code, ensuring consistency with the internal gradient computation.

Update: complete method here; supports all backends (TF 1, TF 2, Eager, Graph, keras & tf.keras), as well as both weights and outputs.


Method

import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.keras.engine import data_adapter
from tensorflow.python.keras.mixed_precision.experimental import (
    loss_scale_optimizer as lso)


def _get_grads_graph(model, x, y, params, sample_weight=None, learning_phase=0):
    # Graph-mode path: build a K.function that fetches the symbolic gradient tensors
    if sample_weight is None:
        sample_weight = np.ones(len(x))
    outputs = model.optimizer.get_gradients(model.total_loss, params)
    inputs  = (model.inputs + model._feed_targets + model._feed_sample_weights
               + [K.learning_phase()])

    grads_fn = K.function(inputs, outputs)
    gradients = grads_fn([x, y, sample_weight, learning_phase])
    return gradients


def _get_grads_eager(model, x, y, params, sample_weight=None, learning_phase=0):
    # Eager path: replicate the gradient computation of the TF 2.2 train step
    def _process_input_data(x, y, sample_weight, model):
        iterator = data_adapter.single_batch_iterator(model.distribute_strategy,
                                                      x, y, sample_weight,
                                                      class_weight=None)
        data = next(iterator)
        data = data_adapter.expand_1d(data)
        x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data)
        return x, y, sample_weight

    def _clip_scale_grads(strategy, tape, optimizer, loss, params):
        with tape:
            if isinstance(optimizer, lso.LossScaleOptimizer):
                loss = optimizer.get_scaled_loss(loss)

        gradients = tape.gradient(loss, params)

        aggregate_grads_outside_optimizer = (
            optimizer._HAS_AGGREGATE_GRAD and not isinstance(
                strategy.extended,
                parameter_server_strategy.ParameterServerStrategyExtended))

        if aggregate_grads_outside_optimizer:
            gradients = optimizer._aggregate_gradients(zip(gradients, params))
        if isinstance(optimizer, lso.LossScaleOptimizer):
            gradients = optimizer.get_unscaled_gradients(gradients)
        gradients = optimizer._clip_gradients(gradients)
        return gradients

    x, y, sample_weight = _process_input_data(x, y, sample_weight, model)

    with tf.GradientTape() as tape:
        y_pred = model(x, training=bool(learning_phase))
        loss = model.compiled_loss(y, y_pred, sample_weight,
                                   regularization_losses=model.losses)

    gradients = _clip_scale_grads(model.distribute_strategy, tape,
                                  model.optimizer, loss, params)
    gradients = K.batch_get_value(gradients)  # fetch as numpy arrays
    return gradients


def get_gradients(model, x, y, params, sample_weight=None, learning_phase=0,
                  evaluate=True):
    if tf.executing_eagerly():
        return _get_grads_eager(model, x, y, params, sample_weight,
                                learning_phase)
    else:
        return _get_grads_graph(model, x, y, params, sample_weight,
                                learning_phase)

Test

import numpy as np
np.random.seed(1)
import random
random.seed(2)
import tensorflow as tf
tf.compat.v1.set_random_seed(3)
tf.random.set_seed(4)
# tf.compat.v1.disable_eager_execution()
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import GlorotUniform

ipt = Input((4,))
out = Dense(4, kernel_initializer=GlorotUniform(seed=0))(ipt)
model = Model(ipt, out)
model.compile('adam', 'mse')
x = y = np.random.randn(32, 4)
model.train_on_batch(x, y)
print(model.get_weights())
grads = get_gradients(model, x, y, model.trainable_weights)
print(grads)
# WEIGHTS (Eager & Graph)
[array([[-0.4995359 ,  0.3558198 ,  0.518725  ,  0.4680259 ],
        [-0.19397011,  0.6424813 ,  0.5327964 , -0.52391374],
        [ 0.6039545 ,  0.07058681, -0.62931913, -0.6724267 ],
        [ 0.42698476, -0.52317786, -0.2453942 ,  0.03615759]], dtype=float32),
 array([-0.00100001,  0.00099961,  0.00100002,  0.00100001], dtype=float32)]

# GRADS (Eager & Graph)
[array([[-0.5818436 ,  0.22703086,  0.2980485 ,  0.42571294],
        [ 0.18901172, -0.20659731,  0.08305292, -0.31698108],
        [ 0.41603914, -0.01972354, -0.72125435, -0.34481353],
        [ 0.38650095, -0.31618145, -0.17637177, -0.55846536]], dtype=float32),
 array([ 0.17147431, -0.00683564, -0.31096804, -0.14086047], dtype=float32)]

Is this what you're looking for?

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

ipt = Input((16,))
out = Dense(16)(ipt)
model = Model(ipt, out)
model.compile('adam', 'mse')

x = y = tf.constant(np.random.randn(32, 16))
model.train_on_batch(x, y)

with tf.GradientTape() as tape:
    # Do the computation in the context of the gradient tape,
    # for example computing the loss
    pred = model(x)
    loss = tf.metrics.MSE(y, pred)

# Get the gradient of the weights w.r.t. the loss
grad = tape.gradient(loss, model.trainable_weights)
grad
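For completeness, a small usage sketch, assuming grad and model from the snippet above:

# Inspect the gradients as numpy arrays (Eager tensors expose .numpy())
grad_np = [g.numpy() for g in grad]
print([g.shape for g in grad_np])

# Optionally apply them with the optimizer for a manual update step
model.optimizer.apply_gradients(zip(grad, model.trainable_weights))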
