我想开发一个分辨率为1024x1024的DCGAN。为此,我需要使用多个gpu,否则可能会花费太多时间。我参考了https://www.tensorflow.org/guide/distributed_training文档
中的介绍在脚本的顶部,我使用
strategy = tf.distribute.MirroredStrategy()
然后在DCGAN中我使用
with strategy.scope():
我得到的错误是:
ValueError:Trying to create optimizer slot variable under the scope for tf.distribute.Strategy, which is different from the scope used for the original variable. Make sure the slot variables are created under the same strategy scope. This may happen if you're restoring from a checkpoint outside the scope.
下面是我的代码:
strategy = tf.distribute.MirroredStrategy()
dataset = keras.preprocessing.image_dataset_from_directory(
"test2", label_mode=None, image_size=(1024, 1024), batch_size=4)
dataset = dataset.map(lambda x: x / 255.0)
discriminator = keras.Sequential(
[
keras.Input(shape=(1024, 1024, 3)),
layers.Conv2D(8, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Conv2D(8, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Conv2D(16, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Conv2D(16, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Conv2D(32, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Conv2D(32, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Flatten(),
layers.Dropout(0.2),
layers.Dense(1, activation="sigmoid"),
],
name="discriminator",
)
discriminator.summary()
latent_dim = 1024
generator = keras.Sequential(
[
keras.Input(shape=(latent_dim,)),
layers.Dense(16 * 16 * 32),
layers.Reshape((16, 16, 32)),
layers.Conv2DTranspose(32, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Conv2DTranspose(32, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Conv2DTranspose(32, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Conv2DTranspose(32, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Conv2DTranspose(32, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Conv2DTranspose(32, kernel_size=4, strides=2, padding="same"),
layers.LeakyReLU(alpha=0.2),
layers.Conv2D(3, kernel_size=5, padding="same", activation="sigmoid"),
],
name="generator",
)
generator.summary()
class GAN(keras.Model):
def __init__(self, strategy, discriminator, generator, latent_dim):
super(GAN, self).__init__()
self.discriminator = discriminator
self.generator = generator
self.latent_dim = latent_dim
self.global_batchsize = 32
self.strategy = strategy
self.batchsize_per_replica = int(self.global_batchsize/self.strategy.num_replicas_in_sync)
def loss_fn(self, labels, predictions):
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True,
reduction=tf.keras.losses.Reduction.NONE)
return loss_fn(labels, predictions)
def compile(self, d_optimizer, g_optimizer):
super(GAN, self).compile()
self.d_optimizer = d_optimizer
self.g_optimizer = g_optimizer
self.d_loss_metric = keras.metrics.Mean(name="d_loss")
self.g_loss_metric = keras.metrics.Mean(name="g_loss")
def metrics(self):
return [self.d_loss_metric, self.g_loss_metric]
def disc_loss(self, real_output, fake_output):
real_loss = self.loss_fn(tf.ones_like(real_output), real_output)
fake_loss = self.loss_fn(tf.zeros_like(fake_output), fake_output)
total_loss = real_loss + fake_loss
total_loss = total_loss/self.global_batchsize
return total_loss
def gen_loss(self, fake_output):
gen_loss = self.loss_fn(tf.ones_like(fake_output), fake_output)
gen_loss = gen_loss / self.global_batchsize
return gen_loss
def distribute_trainstep(self, dist_dataset):
per_replica_g_losses, per_replica_d_losses = self.strategy.experimental_run_v2(self.train_step,dist_dataset)
total_g_loss = self.strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_g_losses,axis=0)
total_d_loss = self.strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_d_losses, axis=0)
return total_g_loss, total_d_loss
def train_step(self, real_images):
batch_size = tf.shape(real_images)[0]
random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))
generated_images = self.generator(random_latent_vectors)
combined_images = tf.concat([generated_images, real_images], axis=0)
labels = tf.concat(
[tf.ones((batch_size, 1)), tf.zeros((batch_size, 1))], axis=0
)
labels += 0.05 * tf.random.uniform(tf.shape(labels))
noise = tf.random.normal(shape=[tf.shape(real_images)[0], self.latent_dim])
with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
generated_imgs = self.generator(noise, training=True)
real_output = self.discriminator(real_images, training=True)
fake_output = self.discriminator(generated_imgs, training=True)
d_loss = self.disc_loss(real_output, fake_output)
g_loss = self.gen_loss(fake_output)
G_grads = gen_tape.gradient(g_loss, self.generator.trainable_variables)
D_grads = disc_tape.gradient(d_loss, self.discriminator.trainable_variables)
self.g_optimizer.apply_gradients(zip(G_grads, self.generator.trainable_variables))
self.d_optimizer.apply_gradients(zip(D_grads, self.discriminator.trainable_variables))
with tf.GradientTape() as gen_tape:
generated_imgs = self.generator_model(noise, training=True)
fake_output = self.discriminator(generated_imgs, training=True)
g_loss = self.gen_loss(fake_output)
G_grads = gen_tape.gradient(g_loss, self.generator_model.trainable_variables)
self.g_optimizer.apply_gradients(zip(G_grads, self.generator.trainable_variables))
return g_loss, d_loss
class GANMonitor(keras.callbacks.Callback):
def __init__(self, num_img=6, latent_dim=32):
self.num_img = num_img
self.latent_dim = latent_dim
def on_epoch_end(self, epoch, logs=None):
random_latent_vectors = tf.random.normal(shape=(self.num_img, self.latent_dim))
generated_images = self.model.generator(random_latent_vectors)
generated_images *= 255
generated_images.numpy()
for i in range(self.num_img):
img = keras.preprocessing.image.array_to_img(generated_images[i])
if epoch %50 ==0:
img.save("./1024/generated_img_%03d_%d.png" % (epoch, i))
epochs = 5000
with strategy.scope():
gan = GAN(strategy, discriminator=discriminator, generator=generator, latent_dim=latent_dim)
gan.compile(
d_optimizer=keras.optimizers.Adam(learning_rate=0.0001),
g_optimizer=keras.optimizers.Adam(learning_rate=0.0001),
)
gan.fit(
dataset, epochs=epochs, callbacks=[GANMonitor(num_img=60, latent_dim=latent_dim)]
)
错误如下
Epoch 1/5000
/home/kuo/.local/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:1082: UserWarning: "`binary_crossentropy` received `from_logits=True`, but the `output` argument was produced by a sigmoid or softmax activation and thus does not represent logits. Was this intended?"
return dispatch_target(*args, **kwargs)
Traceback (most recent call last):
File "1024.py", line 253, in <module>
gan.fit(
File "/home/kuo/.local/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/home/kuo/.local/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py", line 1147, in autograph_handler
raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:
File "/home/kuo/.local/lib/python3.8/site-packages/keras/engine/training.py", line 1021, in train_function *
return step_function(self, iterator)
File "/home/kuo/.local/lib/python3.8/site-packages/keras/engine/training.py", line 1010, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.8/dist-packages/six.py", line 703, in reraise
raise value
File "/home/kuo/.local/lib/python3.8/site-packages/keras/engine/training.py", line 1000, in run_step **
outputs = model.train_step(data)
File "1024.py", line 179, in train_step
self.g_optimizer.apply_gradients(zip(G_grads, self.generator.trainable_variables))
File "/home/kuo/.local/lib/python3.8/site-packages/keras/optimizer_v2/optimizer_v2.py", line 639, in apply_gradients
self._create_all_weights(var_list)
File "/home/kuo/.local/lib/python3.8/site-packages/keras/optimizer_v2/optimizer_v2.py", line 825, in _create_all_weights
self._create_slots(var_list)
File "/home/kuo/.local/lib/python3.8/site-packages/keras/optimizer_v2/adam.py", line 117, in _create_slots
self.add_slot(var, 'm')
File "/home/kuo/.local/lib/python3.8/site-packages/keras/optimizer_v2/optimizer_v2.py", line 902, in add_slot
raise ValueError(
ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy (<tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7f72f39c0430>), which is different from the scope used for the original variable (<tf.Variable 'dense_1/kernel:0' shape=(1024, 8192) dtype=float32, numpy=
array([[-0.00106893, 0.01506512, -0.01771315, ..., -0.01528796,
-0.02354955, -0.0135217 ],
[-0.01760183, -0.02044552, 0.00945723, ..., -0.02140231,
0.01164402, 0.01851213],
[ 0.00233763, -0.0196434 , 0.01152603, ..., -0.02139488,
0.0125667 , 0.0251492 ],
...,
[ 0.00782686, 0.00941393, 0.00423452, ..., -0.0052203 ,
-0.02194414, -0.0167138 ],
[ 0.02420759, -0.02258933, 0.01125678, ..., -0.00626962,
0.00758442, 0.0015665 ],
[-0.00925244, -0.02154037, -0.0209455 , ..., -0.01146874,
0.00285936, 0.01914702]], dtype=float32)>). Make sure the slot variables are created under the same strategy scope. This may happen if you're restoring from a checkpoint outside the scope.
使用Keras函数API而不是顺序API来指定网络体系结构为我解决了这个问题。参见https://keras.io/guides/functional_api。