I am doing hyperparameter optimization with TF2. For example, I define a range of learning rates, `lr = [0.0001, 0.001, 0.01]`, and pass each value to a `Trainer` function that contains a custom training loop (using `tf.GradientTape`). However, when I use `@tf.function`, I run into an error. My training structure looks like this:
```python
def Trainer(lr):
    # Define the optimizer
    optim = tf.keras.optimizers.experimental.Nadam(learning_rate=lr)
    train_dataset, test_dataset, sample_size = dataset_load(arg)
    # Define the model
    model_config = {arg}
    net = Mymodel(model_config)
    step = 0
    with tqdm.tqdm(total=max_step, leave=True, desc='Training') as pbar:
        while step < max_step:
            for signal in train_dataset:
                # Calculate loss
                loss = train_batch(signal and other parameter)
                step += 1
                pbar.update()
                pbar.set_postfix(
                    {'loss': loss.numpy(),
                     'step': step})
```
The `train_batch` function is:
```python
@tf.function
def train_batch(signal, arg...):
    with tf.GradientTape() as tape:
        tape.watch(model.trainable_variables)
        loss = compute_loss([signal], model)
    grad = tape.gradient(loss, model.trainable_variables,
                         unconnected_gradients=tf.UnconnectedGradients.ZERO)
    optim.apply_gradients(
        zip(grad, model.trainable_variables))
    del grad
    return loss
```
For the outer loop, I define `lr` and then run: `for lr_current in lr: Trainer(lr_current)`.
The program runs fine for the first `lr_current`, but when the outer for loop reaches the second value of `lr_current`, it raises:

```
ValueError: tf.function only supports singleton tf.Variables created on the first call. Make sure the tf.Variable is only created once or created outside tf.function.
```
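My reading of the `tf.function` docs is that a `tf.function` may only create `tf.Variable`s during its very first trace. Here is a toy snippet that raises the same ValueError for me; `ToyModel` is just a hypothetical stand-in for my `Mymodel`, and I force the retrace with a changed input shape (in my real code I assume rebuilding `net`/`optim` plays that role):

```python
import tensorflow as tf

class ToyModel:  # hypothetical stand-in for Mymodel
    def __init__(self):
        self.v = None

    def __call__(self, x):
        if self.v is None:
            # Variables are created lazily on the first call, like a Keras model
            self.v = tf.Variable(1.0)
        return self.v * x

net = ToyModel()

@tf.function
def step(x):
    return net(x)

step(tf.ones([2]))   # OK: variables may be created during the first trace
net = ToyModel()     # fresh model, as on the next lr iteration
step(tf.ones([3]))   # new input shape forces a retrace; the fresh model creates
                     # another variable inside tf.function -> the same ValueError
```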
I do not understand why this error occurs. I suspect it is related to the for loop around `Trainer`. I also tried `del net` after training finishes, but it did not help. When I remove `@tf.function`, the program runs fine for every `lr_current`.
I have uploaded a minimal reproducible example to Colab. Could someone take a look? Thanks in advance!
I have already done the legwork for you: just copy this code and run it in a notebook.
```python
from typing import List

import numpy as np
import tensorflow as tf
import tqdm

def Trainer():
    # loss_func = tf.keras.losses.RootMeanSquaredError()
    train_dataset, test_dataset, sample_size = dataset_load(time_len=100, batch_size=16)
    epoch = 0
    step = 0
    with tqdm.tqdm(total=10, leave=True, desc='Training') as pbar:
        while epoch < 10:
            for signal in train_dataset:
                obs_signal, obs_mask, impute_mask = genmask(signal=signal, missing_ratio=0.2,
                                                            missing_type='rm')
                # Calculate loss
                loss = train_batch(signal=obs_signal, obs_mask=obs_mask,
                                   impute_mask=impute_mask)
                step += 1
                pbar.set_postfix(
                    {'loss': loss.numpy(),
                     'step': step,
                     'epoch': epoch})
            epoch += 1
            pbar.update()
```
```python
def compute_loss(signal_mask: List, diff_params):
    obs_mask = signal_mask[1]
    impute_mask = signal_mask[2]
    # [B, T], [B, T]
    epsilon_theta, eps = diffusion(signal_mask, diff_params)
    # MSE loss
    target_mask = obs_mask - impute_mask
    residual = (epsilon_theta - eps) * target_mask
    loss = tf.reduce_sum(residual**2) / (tf.reduce_sum(target_mask)
                                         if tf.reduce_sum(target_mask) > 0 else 1.0)
    return loss
```
```python
def diffusion(signal_mask: List, diff_params, eps=None):
    assert len(signal_mask) == 3
    signal = signal_mask[0]
    cond_mask = signal_mask[2]
    B, L, C = signal.shape[0], signal.shape[1], signal.shape[2]  # B is batch size, C=1, L is signal length
    _dh = diff_params
    T, Alpha_bar = _dh["T"], _dh["Alpha_bar"]
    timesteps = tf.random.uniform(
        shape=[B, 1, 1], minval=0, maxval=T, dtype=tf.int32)  # [B, 1, 1], randomly sample diffusion steps from 1~T
    if eps is None:
        eps = tf.random.normal(tf.shape(signal))  # random noise
    extracted_alpha = tf.gather(Alpha_bar, timesteps)
    transformed_X = tf.sqrt(extracted_alpha) * signal + tf.sqrt(
        1 - extracted_alpha) * eps  # compute x_t from q(x_t|x_0)
    timesteps = tf.cast(timesteps, tf.float32)
    total_input = tf.stack([cond_mask * signal,
                            (1 - cond_mask) * transformed_X], axis=-1)  # B, L, K, 2
    obser_tp = tf.range(signal.shape[1])
    epsilon_theta = net(
        (total_input, obser_tp, cond_mask,
         tf.squeeze(timesteps, axis=-1)))  # predict epsilon according to epsilon_theta
    return epsilon_theta, eps
```
```python
def dataset_load(time_len, batch_size):
    train_data = np.random.randn(batch_size * 10, time_len, 10)
    test_data = np.random.randn(batch_size, time_len, 10)
    shuffle_size_train = train_data.shape[0]
    train_dataset = (tf.data.Dataset.from_tensor_slices(train_data)
                     .shuffle(shuffle_size_train)
                     .batch(batch_size, drop_remainder=True))
    test_dataset = tf.convert_to_tensor(test_data)
    L = train_data.shape[-2]
    K = train_data.shape[-1]
    return (train_dataset, test_dataset, [L, K])
```
```python
def genmask(signal: tf.Tensor, missing_ratio, missing_type):
    """Generate the mask
    Returns:
        observed_values (tf.Tensor): [B, T, K], multivariate time series with K features
        observed_masks (tf.Tensor): [B, T, K], mask for observation points
        impute_mask (tf.Tensor): [B, T, K], mask for imputation target
    """
    miss_ratio = missing_ratio
    observed_values = signal.numpy().astype(np.single)
    observed_mask = ~np.isnan(observed_values)
    rand_for_mask = np.random.rand(*observed_mask.shape) * observed_mask
    rand_for_mask = rand_for_mask.reshape(len(rand_for_mask), -1)  # B, L*K
    for i in range(len(observed_mask)):  # Loop over the batch
        sample_ratio = np.random.rand() if not missing_ratio else missing_ratio  # missing ratio
        num_observed = observed_mask[i].sum()
        num_masked = round(num_observed * sample_ratio)
        rand_for_mask[i][np.argpartition(rand_for_mask[i], -num_masked)[-num_masked:]] = -1
    gt_masks = (rand_for_mask > 0).reshape(observed_mask.shape).astype(np.single)
    observed_mask = observed_mask.astype(np.single)
    return observed_values, observed_mask, gt_masks
```
```python
@tf.function
def train_batch(signal, obs_mask, impute_mask):
    """Wrapped training on a batch using a static graph.
    Args:
        signal (tf.Tensor): [B, T, K], multivariate time series with K features
        obs_mask (tf.Tensor): [B, T, K], mask for observation points
        impute_mask (tf.Tensor): [B, T, K], mask for imputation target
    Returns:
        loss (float): average loss on a batch
    """
    with tf.GradientTape() as tape:
        tape.watch(net.trainable_variables)
        loss = compute_loss([signal, obs_mask, impute_mask],
                            diffusion_hyperparams)
    grad = tape.gradient(loss, net.trainable_variables,
                         unconnected_gradients=tf.UnconnectedGradients.ZERO)
    optim.apply_gradients(
        zip(grad, net.trainable_variables))
    del grad
    return loss
```
```python
lr = [0.001, 0.002, 0.01]
for lr_iter in lr:
    optim = tf.keras.optimizers.experimental.Nadam(lr_iter)
    model_config = {"res_channels": 64}
    net = mymodel(model_config)
    diffusion_hyperparams = calc_diffusion_hyperparams(T=50, beta_0=0.0001, beta_T=0.5, strategy="quadratic")
    Trainer()
    tf.keras.backend.clear_session()
    del net
```
Notebook link: https://colab.research.google.com/drive/1uh3-q3hM4obKLbh93sfT25zoUbK4jfUJ?usp=sharing
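For context, the restructuring I am considering (untested against my actual model; the `Dense` network below is only a hypothetical stand-in for `mymodel`) is to build a fresh `tf.function` for every `Trainer` call, so each traced function only ever creates variables on its own first call:

```python
import tensorflow as tf

def make_train_step(net, optim):
    # A new tf.function per (net, optim) pair: each one traces once and
    # owns the variables created during that first trace.
    @tf.function
    def train_step(x, y):
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean((net(x) - y) ** 2)
        grads = tape.gradient(loss, net.trainable_variables)
        optim.apply_gradients(zip(grads, net.trainable_variables))
        return loss
    return train_step

for lr_iter in [0.001, 0.002, 0.01]:
    net = tf.keras.Sequential([tf.keras.layers.Dense(1)])  # stand-in for mymodel
    optim = tf.keras.optimizers.experimental.Nadam(lr_iter)
    train_step = make_train_step(net, optim)  # fresh graph for this lr
    x = tf.random.normal([16, 8])
    y = tf.random.normal([16, 1])
    for _ in range(3):
        loss = train_step(x, y)
    print(lr_iter, float(loss))
```

With this layout the module-level `@tf.function` from my question disappears, which is the part I believe causes the error, but I would still like to understand why `del net` plus `tf.keras.backend.clear_session()` is not enough.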