I implemented a Transformer encoder in Keras using the template provided by Francois Chollet. After training the model I save it with model.save, but when I load it again for inference I find that the weights appear to be random again, so my model loses all of its inference ability.
I have looked at similar questions on Stack Overflow and GitHub and applied the following suggestions, but I still get the same problem:
- Use the @tf.keras.utils.register_keras_serializable() decorator on the class
- Make sure **kwargs is in the init call
- Make sure the custom layer has get_config and from_config methods
- Use custom_object_scope when loading the model
Below is a minimal reproducible example that replicates the issue. How do I change it so that the model weights are saved correctly?
import numpy as np
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers
from keras.models import load_model
from keras.utils import custom_object_scope
@tf.keras.utils.register_keras_serializable()
class TransformerEncoder(layers.Layer):
def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.dense_dim = dense_dim
self.num_heads = num_heads
self.attention = layers.MultiHeadAttention(
num_heads=num_heads, key_dim=embed_dim)
self.dense_proj = keras.Sequential(
[
layers.Dense(dense_dim, activation="relu"),
layers.Dense(embed_dim),
]
)
self.layernorm_1 = layers.LayerNormalization()
self.layernorm_2 = layers.LayerNormalization()
def call(self, inputs, mask=None):
if mask is not None:
mask = mask[:, tf.newaxis, :]
attention_output = self.attention(
inputs, inputs, attention_mask=mask)
proj_input = self.layernorm_1(inputs + attention_output)
proj_output = self.dense_proj(proj_input)
return self.layernorm_2(proj_input + proj_output)
def get_config(self):
config = super().get_config()
config.update({
"embed_dim": self.embed_dim,
"num_heads": self.num_heads,
"dense_dim": self.dense_dim,
})
return config
@classmethod
def from_config(cls, config):
return cls(**config)
# Create simple model:
encoder = TransformerEncoder(embed_dim=2, dense_dim=2, num_heads=1)
inputs = keras.Input(shape=(2, 2), batch_size=None, name="test_inputs")
x = encoder(inputs)
x = layers.Flatten()(x)
outputs = layers.Dense(1, activation="linear")(x)
model = keras.Model(inputs, outputs)
# Fit the model and save it:
np.random.seed(42)
X = np.random.rand(10, 2, 2)
y = np.ones(10)
model.compile(optimizer=keras.optimizers.Adam(), loss="mean_squared_error")
model.fit(X, y, epochs=2, batch_size=1)
model.save("./test_model")
# Load the saved model:
with custom_object_scope({
'TransformerEncoder': TransformerEncoder
}):
loaded_model = load_model("./test_model")
print(model.weights[0].numpy())
print(loaded_model.weights[0].numpy())
The weights are saved (you can load them with load_weights after loading the model). The problem is that you create new layers in __init__. You need to recreate them from their config instead, for example:
class TransformerEncoder(layers.Layer):
def __init__(self, embed_dim, dense_dim, num_heads, attention_config=None, dense_proj_config=None, **kwargs):
super().__init__(**kwargs)
self.embed_dim = embed_dim
self.dense_dim = dense_dim
self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        ) if attention_config is None else layers.MultiHeadAttention.from_config(attention_config)
self.dense_proj = keras.Sequential(
[
layers.Dense(dense_dim, activation="relu"),
layers.Dense(embed_dim),
]
) if dense_proj_config is None else keras.Sequential.from_config(dense_proj_config)
...
def call(self, inputs, mask=None):
...
def get_config(self):
config = super().get_config()
config.update({
"embed_dim": self.embed_dim,
"num_heads": self.num_heads,
"dense_dim": self.dense_dim,
"attention_config": self.attention.get_config(),
"dense_proj_config": self.dense_proj.get_config(),
})
return config
Output:
[[[-0.810745 -0.14727005]]
[[ 0.8542909 0.09689581]]]
[[[-0.810745 -0.14727005]]
[[ 0.8542909 0.09689581]]]
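For completeness, here is a minimal sketch of the load_weights workaround mentioned at the top of this answer, reusing the names and path from the question (passing a SavedModel directory to load_weights is supported in recent TF 2.x versions; treat this as a sketch, not the primary fix):
with custom_object_scope({'TransformerEncoder': TransformerEncoder}):
    loaded_model = load_model("./test_model")  # architecture loads, but the custom sublayers are freshly created
loaded_model.load_weights("./test_model")      # pull the trained weights back in from the SavedModel
print(loaded_model.weights[0].numpy())         # now matches model.weights[0]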
The secret is in how the initialization works. You can experiment with model.get_weights(), but the examples below use layer.get_weights() because the effect is easier to see there.
Example: a custom layer with random initial values ends up with different random numbers each time it is run.
import tensorflow as tf
class MyDenseLayer(tf.keras.layers.Layer):
def __init__(self, num_outputs):
super(MyDenseLayer, self).__init__()
self.num_outputs = num_outputs
def build(self, input_shape):
""" initialize weights with randomize numbers """
min_size_init = tf.keras.initializers.RandomUniform(minval=1, maxval=5, seed=None)
self.kernel = self.add_weight(shape=[int(input_shape[-1]), self.num_outputs],
initializer = min_size_init, trainable=True)
def call(self, inputs):
return tf.matmul(inputs, self.kernel)
start = 3
limit = 33
delta = 3
# Create DATA
sample = tf.range(start, limit, delta)
sample = tf.cast( sample, dtype=tf.float32 )
# Initial shape: (10, 1)
sample = tf.constant( sample, shape=( 10, 1 ) )
layer = MyDenseLayer(10)
data = layer(sample)
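Each "round" below is a separate run of the script. To see the values drawn for a given run, print the kernel after the layer has been built, e.g.:
print(layer.get_weights())  # the randomly drawn kernel for this run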
Output: the same layer initialized during call() continues with those weights; each round below is a separate run of the script.
### 1st round ###
# [array([[-0.07862139, -0.45416605, -0.53606 , 0.18597281, 0.2919714 ,
# -0.27334914, 0.60890776, -0.3856985 , 0.58052486, -0.5634572 ]], dtype=float32)]
### 2nd round ###
# [array([[ 0.5949032 , 0.05113244, -0.51997787, 0.26252705, -0.09235346,
# -0.35243294, -0.0187515 , -0.12527376, 0.22348166, 0.37051445]], dtype=float32)]
### 3rd round ###
# [array([[-0.6654639 , -0.46027896, -0.48666477, -0.23095328, 0.30391783,
# 0.21867174, -0.5405392 , -0.45399982, -0.22143698, 0.66893476]], dtype=float32)]
Example: calling build() again re-initializes the layer's weights each time it is invoked.
layer.build([1])
print( data )
print( layer.get_weights() )
Output: the weights are different now; they do not continue from the previous initialization.
### 1st round ###
# [array([[ 0.73738164, 0.14095825, -0.5416008 , -0.35084447, -0.35209572,
# -0.35504425, 0.1692887 , 0.2611189 , 0.43355125, -0.3325353 ]], dtype=float32)]
### 2nd round ###
# [array([[ 0.5949032 , 0.05113244, -0.51997787, 0.26252705, -0.09235346,
# -0.35243294, -0.0187515 , -0.12527376, 0.22348166, 0.37051445]], dtype=float32)]
### 3rd round ###
# [array([[-0.6654639 , -0.46027896, -0.48666477, -0.23095328, 0.30391783,
# 0.21867174, -0.5405392 , -0.45399982, -0.22143698, 0.66893476]], dtype=float32)]
Example: if we fix the layer's initial values, every run starts from the same initialization.
""" initialize weights with values ones """
min_size_init = tf.keras.initializers.Ones()
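In the MyDenseLayer above, build() then becomes (the same method with only the initializer swapped):
    def build(self, input_shape):
        """ initialize weights with ones """
        min_size_init = tf.keras.initializers.Ones()
        self.kernel = self.add_weight(shape=[int(input_shape[-1]), self.num_outputs],
                                      initializer=min_size_init, trainable=True)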
Output: the same result is reproduced on every run.
### 1st round ###
# tf.Tensor(
# [[ 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
# [ 6. 6. 6. 6. 6. 6. 6. 6. 6. 6.]
# [ 9. 9. 9. 9. 9. 9. 9. 9. 9. 9.]
# [12. 12. 12. 12. 12. 12. 12. 12. 12. 12.]
# [15. 15. 15. 15. 15. 15. 15. 15. 15. 15.]
# [18. 18. 18. 18. 18. 18. 18. 18. 18. 18.]
# [21. 21. 21. 21. 21. 21. 21. 21. 21. 21.]
# [24. 24. 24. 24. 24. 24. 24. 24. 24. 24.]
# [27. 27. 27. 27. 27. 27. 27. 27. 27. 27.]
# [30. 30. 30. 30. 30. 30. 30. 30. 30. 30.]], shape=(10, 10), dtype=float32)
# [array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], dtype=float32)]
### 2nd round ###
# tf.Tensor(
# [[ 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
# [ 6. 6. 6. 6. 6. 6. 6. 6. 6. 6.]
# [ 9. 9. 9. 9. 9. 9. 9. 9. 9. 9.]
# [12. 12. 12. 12. 12. 12. 12. 12. 12. 12.]
# [15. 15. 15. 15. 15. 15. 15. 15. 15. 15.]
# [18. 18. 18. 18. 18. 18. 18. 18. 18. 18.]
# [21. 21. 21. 21. 21. 21. 21. 21. 21. 21.]
# [24. 24. 24. 24. 24. 24. 24. 24. 24. 24.]
# [27. 27. 27. 27. 27. 27. 27. 27. 27. 27.]
# [30. 30. 30. 30. 30. 30. 30. 30. 30. 30.]], shape=(10, 10), dtype=float32)
# [array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], dtype=float32)]
Sample: an application.
import numpy as np  # tensorflow was already imported above

# coefficient_0 ... coefficient_9 are application-specific weights (left as placeholders here)
temp = tf.random.normal([10], 1, 0.2, tf.float32)
temp = np.asarray(temp) * np.asarray([coefficient_0, coefficient_1, coefficient_2, coefficient_3, coefficient_4,
                                      coefficient_5, coefficient_6, coefficient_7, coefficient_8, coefficient_9])
temp = tf.nn.softmax(temp)
action = int(np.argmax(temp))
Output: each coefficient weights one of the environment variables, and argmax() selects the max() (or min()) value that maps to the target action in the game. The random values are added so that the action feedback does not always collapse to the same filtered choice.