tf.test.compute_gradient shows unexpected behaviour with respect to the size of the perturbation delta



I am trying to use tf.test.compute_gradient to check whether tf correctly differentiates my (fairly complex) custom loss function. However, tf.test.compute_gradient shows unexpected behaviour with respect to the size of the perturbation delta (dx).

For a 1-D function f(x), the numerical Jacobian in tf.test.compute_gradient is computed as (f(x + dx) - f(x - dx)) / (2 dx), so we expect it to converge to the true value with an error E(dx) = f'''(x) dx^2 / 6 -> 0 as dx -> 0. I have not worked out the calculation for higher dimensions, but I expect the behaviour to be roughly the same.
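
As a quick sanity check of that error estimate, here is a plain-NumPy sketch (no TensorFlow involved) of the same central-difference rule on a function with non-zero f'''(x); in float64 the error does shrink roughly like dx^2 until round-off takes over:

import numpy as np

# Central-difference quotient, the same rule tf.test.compute_gradient uses
f = lambda x: np.sin(x)   # f'''(x) = -cos(x) != 0
df = lambda x: np.cos(x)  # exact derivative

x = 1.0
for dx in 10.0 ** -np.arange(1, 7):
    num = (f(x + dx) - f(x - dx)) / (2 * dx)
    print(f"dx={dx:.0e}  |error|={abs(num - df(x)):.3e}")
# The error decreases roughly as dx**2 until float64 round-off (~1e-16 / dx) dominates.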

However, E(dx) appears to diverge (-> +inf) as dx -> 0, even for a very simple function.

If the numerical Jacobian does not converge when it should, how can I test the gradients of a custom function and catch a possible bug in the way tf differentiates it automatically? How can I tell round-off errors apart from errors in the automatically computed tf gradient? Which value of delta should I pick? Does it depend on the number of dimensions of the problem (e.g. when there is a tf.reduce_sum and errors accumulate)?

In the code example below, I run the test on the log of the Normal law. Since f'''(x) is theoretically zero, the numerical Jacobian should be exact; yet max(|Jth - Jnu|) grows as delta decreases.

The problem occurs for any number of dimensions n, with both dtype=float32 and dtype=float64.

The error is smallest for n = 1 and eval_jac_at = 1, but it gets much larger for harder problems (eval_jac_at >> 1) or high-dimensional ones (n >> 1).

import tensorflow as tf
import tensorflow.math as tfm
from tensorflow_probability import distributions as tfd
import numpy as np
import matplotlib.pyplot as plt
"""
Comparing theoretical and numerical gradients for a simple function 
using tf.test.compute_gradient as a function of the perturbation delta 
The studied function is the log of multivariate normal law:
log(N(loc, scale)) = -\sum_i (x_i - loc_i)**2 / (2 * scale_i**2) + cst
"""
# number of dimensions
n = 1
# data type
dtype = "float32"
# evaluate jacobian at (in every dimension)
eval_jac_at = 1
loc = tf.cast(0.0, dtype)
# .5 < scale < 1.5 
scale = tf.cast(0.5 + np.random.rand(n), dtype)

norm_dist = tfd.Normal(loc=loc, scale=scale)

def f(x):
    """
    Explicit definition
    """
    return tfm.reduce_sum(-tfm.pow(x - loc, 2) / (2 * scale ** 2))

def g(x):
    """
    Using tfp.distributions
    """
    return tfm.reduce_sum(norm_dist.log_prob(x))

def compute_err_gradient(f, delta):
    """
    Returns max(|Jnu - Jth|) for a function f and perturbation delta.
    Jacobians evaluated at eval_jac_at * [1, ..., 1].
    """
    Jth, Jnu = tf.test.compute_gradient(f, [eval_jac_at * tf.ones(n, dtype)], delta)
    return np.max(np.abs(Jnu[0] - Jth[0]))

# Compute a series of delta = 1 / 2**i
deltas = 1 / np.power(2, np.arange(5, 14))
# Compute error on Jacobian for each value of delta
err_f = np.array([compute_err_gradient(f, delta) for delta in deltas])
err_g = np.array([compute_err_gradient(g, delta) for delta in deltas])
# Print and plot results
print()
print(f"{deltas=}")
print(f"{err_f=}")
print(f"{err_g=}")
print()
fig, ax = plt.subplots()
ax.loglog(deltas, err_f, label="f")
ax.loglog(deltas, err_g, label="g")
ax.set_xlabel("delta")
ax.set_ylabel("Jth - Jnu")
ax.legend()
ax.grid()
plt.show()

Output:

deltas=array([0.03125   , 0.015625  , 0.0078125 , 0.00390625, 0.00195312,
0.00097656, 0.00048828, 0.00024414, 0.00012207])
err_f=array([1.19209290e-07, 1.19209290e-07, 1.19209290e-07, 1.19209290e-07,
3.69548798e-06, 3.93390656e-06, 1.91926956e-05, 1.13248825e-05,
7.23600388e-05], dtype=float32)
err_g=array([1.19209290e-07, 3.69548798e-06, 3.93390656e-06, 3.93390656e-06,
1.13248825e-05, 1.13248825e-05, 4.97102737e-05, 4.97102737e-05,
4.97102737e-05], dtype=float32)
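
For comparison, here is a plain-NumPy float32 sketch of the same difference quotient (no TensorFlow involved; scale is an arbitrary value in (0.5, 1.5), like above), to isolate the round-off contribution of the quotient itself:

import numpy as np

# float32 central difference on the same parabola as f() above
scale = np.float32(0.9)              # assumption: some value in (0.5, 1.5)
x = np.float32(1.0)                  # eval_jac_at = 1

f32 = lambda x: -(x ** 2) / (2 * scale ** 2)
exact = -x / scale ** 2              # analytic derivative

for delta in (1 / 2 ** np.arange(5, 14)).astype(np.float32):
    num = (f32(x + delta) - f32(x - delta)) / (2 * delta)
    print(f"delta={delta:.2e}  |error|={abs(num - exact):.3e}")
# No truncation error here (f''' == 0); the printed error is round-off
# and grows roughly like eps_float32 * |f(x)| / delta as delta shrinks.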

Don't worry: tf.GradientTape is the most common way to get the automatically differentiated gradient, which matches the theoretical one. You can always use tf.GradientTape in your training code, for example:

import tensorflow as tf
from tensorflow_probability import distributions as tfd
import numpy as np
"""
Comparing theoretical and numerical gradients for a simple function 
using tf.test.compute_gradient as a function of the perturbation delta 
The studied function is the log of multivariate normal law:
log(N(loc, scale)) = -\sum_i (x_i - loc_i)**2 / (2 * scale_i**2) + cst
"""
tf.keras.utils.set_random_seed(0)
# number of dimensions
n = 1
# data type
dtype = "float32"
# evaluate jacobian at (in every dimension)
eval_jac_at = 1
loc = tf.cast(0.0, dtype)
# .5 < scale < 1.5 
scale = tf.cast(0.5 + np.random.rand(n), dtype)
norm_dist = tfd.Normal(loc=loc, scale=scale)

def f(x):
    """
    Explicit definition
    """
    return tf.reduce_sum(-tf.pow(x - loc, 2) / (2 * scale ** 2))

def g(x):
    """
    Using tfp.distributions
    """
    return tf.reduce_sum(norm_dist.log_prob(x))

class MyTest(tf.test.TestCase):
    def test_gradient_of_test_func(self, test_func, inputs, delta):
        theoretical, numerical = tf.test.compute_gradient(test_func, inputs, delta)
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(inputs)
            y = test_func(inputs)
        auto_diff = (tape.jacobian(y, inputs),)
        # self.assertAllClose(theoretical, numerical)
        tf.print(theoretical)
        tf.print(auto_diff)
        self.assertAllClose(theoretical, auto_diff)

my_test = MyTest()
deltas = tf.convert_to_tensor(1 / np.power(2, np.arange(5, 14)))
for delta in deltas:
    my_test.test_gradient_of_test_func(f, [eval_jac_at * tf.ones(n, dtype)], delta)
print("*" * 20)
for delta in deltas:
    my_test.test_gradient_of_test_func(g, [eval_jac_at * tf.ones(n, dtype)], delta)

And the output:

(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
********************
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
(array([[-0.9090829]], dtype=float32),)
([[-0.90908289]],)
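
For a quick check outside a tf.test.TestCase, the same comparison can also be written as a minimal standalone sketch (the toy f, the evaluation point and the tolerance here are assumptions, not taken from the question):

import numpy as np
import tensorflow as tf

x = tf.ones(3, dtype="float64")                  # evaluation point (assumed)

def f(x):
    return tf.reduce_sum(-0.5 * tf.pow(x, 2))    # toy loss (assumed)

# Theoretical Jacobian from compute_gradient (the numerical one is ignored here)
theoretical, numerical = tf.test.compute_gradient(f, [x], 1e-3)

# Autodiff Jacobian from GradientTape
with tf.GradientTape() as tape:
    tape.watch(x)
    y = f(x)
auto_jac = tape.jacobian(y, x)

# compute_gradient returns 2-D arrays, so compare the flattened values
np.testing.assert_allclose(np.ravel(theoretical[0]),
                           np.ravel(auto_jac.numpy()), rtol=1e-6)
print("theoretical and autodiff Jacobians agree")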
