我的 GAN 实现无法获得完整的 GPU 利用率



我构建了一个 GAN 网络,用于从形状为 [(40,40,4),(20,20,6)] 的两个输入预测形状为 (40,40,6) 的输出。

该模型实际上正在工作并且已经提供了结果,但我"只"获得了 60% 到 70% 的 GPU 利用率(由 nvidia-smi 显示)。

我的问题是,这是否是这样一个模型的内在因素,因为它必须在train_on_batch的调用之间做一些事情,或者是否有办法加快这个过程?

随机数据的极简工作示例如下所示:

import numpy as np
import os

import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import UpSampling3D
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Lambda
from tensorflow.keras.optimizers import Adam

# Enable on-demand GPU memory allocation so TF does not grab all VRAM up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth must be configured identically across all GPUs.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # set_memory_growth raises if the GPUs were already initialized.
        print(e)



# =============================================================================
# define the model    
# =============================================================================
def resBlock(X_in, num_of_features, kernel_size, scale):
    """Residual block: conv -> relu -> conv, scale the residual, add the skip.

    Returns a tensor with the same shape as ``X_in``.
    """
    residual = Conv2D(num_of_features, kernel_size,
                      kernel_initializer='he_uniform', padding='same')(X_in)
    residual = Activation('relu')(residual)
    residual = Conv2D(num_of_features, kernel_size,
                      kernel_initializer='he_uniform', padding='same')(residual)
    # down-scale the residual branch before adding it back (residual scaling)
    residual = Lambda(lambda t: t * scale)(residual)
    return Add()([X_in, residual])
class Generator(object):
    """Builds the generator model.

    Input A (full resolution, 4 channels) and input B (half resolution,
    6 channels) are merged after B is spatially upsampled x2, passed through
    a conv + residual-block trunk, and combined with a final skip connection
    from the upsampled B input.
    """

    def __init__(self, noise_shape):
        self.noise_shape = noise_shape
        self.num_of_features = 128
        self.kernel_size = (3, 3)
        self.scale = 0.1                  # residual scaling factor
        self.padding = 8
        self.hp = int(self.padding / 2)   # half padding

    def generator(self):
        """Assemble and return the (uncompiled) Keras generator model."""
        # two inputs; B gets upsampled x2 in both spatial dims to match A
        in_a = Input((32 + self.padding, 32 + self.padding, 4), name='input_A')
        in_b = Input((16 + self.hp, 16 + self.hp, 6), name='input_B')
        in_b_up = UpSampling3D(size=(2, 2, 1))(in_b)
        # merge everything along the channel axis
        merged = concatenate([in_a, in_b_up], axis=3)
        # initial convolution
        features = Conv2D(self.num_of_features,
                          self.kernel_size,
                          activation='relu',
                          padding='same',
                          kernel_initializer='he_normal')(merged)
        # residual trunk
        for _ in range(6):
            features = resBlock(features, self.num_of_features,
                                self.kernel_size, self.scale)
        # project back down to 6 output channels
        features = Conv2D(6, (3, 3), kernel_initializer='he_uniform',
                          padding='same')(features)
        # global skip connection from the upsampled B input
        out = Add()([features, in_b_up])
        return Model(inputs=[in_a, in_b], outputs=out)
def discriminator_block(model, filters, kernel_size, strides):
    """Conv -> BatchNorm -> LeakyReLU unit used by the discriminator."""
    out = Conv2D(filters=filters, kernel_size=kernel_size,
                 strides=strides, padding="same")(model)
    out = BatchNormalization(momentum=0.5)(out)
    return LeakyReLU(alpha=0.2)(out)
class Discriminator(object):
    """Builds the real/fake classifier: stacked conv blocks + dense head."""

    def __init__(self, image_shape):
        self.image_shape = image_shape

    def discriminator(self):
        """Assemble and return the (uncompiled) discriminator model."""
        dis_input = Input(shape=self.image_shape)
        x = Conv2D(filters=64, kernel_size=3, strides=1,
                   padding="same")(dis_input)
        x = LeakyReLU(alpha=0.2)(x)
        # conv blocks: filters double every other block, stride-2 blocks
        # halve the spatial resolution
        for filters, strides in ((64, 2), (128, 1), (128, 2), (256, 1),
                                 (256, 2), (512, 1), (512, 2)):
            x = discriminator_block(x, filters, 3, strides)
        # dense classification head with a sigmoid "real" probability
        x = Flatten()(x)
        x = Dense(1024)(x)
        x = LeakyReLU(alpha=0.2)(x)
        x = Dense(1)(x)
        x = Activation('sigmoid')(x)
        return Model(inputs=dis_input, outputs=x)
def get_gan_network(discriminator, shape_list_AB, generator, optimizer, loss):
    """Stack the generator and the (frozen) discriminator into one GAN model.

    The combined model outputs both the generated image (trained with
    ``loss``) and the discriminator verdict (trained with binary
    cross-entropy at a 1e-3 weight).
    """
    # freeze D so that training the GAN only updates the generator
    discriminator.trainable = False
    in_a = Input(shape=shape_list_AB[0])
    in_b = Input(shape=shape_list_AB[1])
    generated = generator([in_a, in_b])
    validity = discriminator(generated)
    gan = Model(inputs=[in_a, in_b], outputs=[generated, validity])
    gan.compile(loss=[loss, "binary_crossentropy"],
                loss_weights=[1., 1e-3],
                optimizer=optimizer)
    return gan

def get_optimizer():
    """Return the Adam optimizer shared by all models.

    Uses the ``learning_rate`` keyword instead of the deprecated ``lr``
    alias (tf.keras renamed the argument in TF 2.0).
    """
    return Adam(learning_rate=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)


# =============================================================================
# choose some parameters and compile the model
# =============================================================================
batch_size = 128
shape_input_A = (40,40,4)  # generator input A (full resolution)
shape_input_B = (20,20,6)  # generator input B (upsampled x2 inside the generator)
shape_output = (40,40,6)   # generator output / discriminator input

# build and compile the three models; the combined GAN reuses both networks
generator = Generator(shape_input_B).generator() # todo shape
discriminator = Discriminator(shape_output).discriminator() # todo shape
optimizer = get_optimizer()
generator.compile(loss="mse", optimizer=optimizer)
discriminator.compile(loss="binary_crossentropy", optimizer=optimizer)
gan = get_gan_network(discriminator, [shape_input_A,shape_input_B], generator, optimizer, "mse")  


# =============================================================================
# training
# =============================================================================
def get_random_data(mod):
    """Return random generator inputs (``mod == 0``) or a random target batch.

    ``mod == 0``: list of the two network inputs A and B.
    otherwise:    one batch shaped like the network output.
    """
    if mod == 0:
        return [np.random.rand(batch_size, 40, 40, 4),
                np.random.rand(batch_size, 20, 20, 6)]
    return np.random.rand(batch_size, 40, 40, 6)

# Initialize reusable buffers once and fill them in place each batch to
# avoid per-iteration reallocation. (The original also allocated `rand_nums`
# with the deprecated `np.int` dtype and an `image_batch_lr` array; neither
# was ever used, so both are dropped here.)
image_batch_hr = np.empty((batch_size,)+shape_output)
generated_images_sr = np.empty_like(image_batch_hr)
real_data_Y = np.empty(batch_size)
fake_data_Y = np.empty(batch_size)
for e in range(1, 10):
    print("epoch:", e)
    for batchindex in range(200):
        # produce fake samples from random generator inputs
        generated_images_sr[:] = generator.predict(get_random_data(0))
        # one-sided label smoothing: real in [0.8, 1.0], fake in [0.0, 0.2]
        real_data_Y[:] = np.ones(batch_size) - np.random.random_sample(batch_size)*0.2
        fake_data_Y[:] = np.random.random_sample(batch_size)*0.2
        # train the discriminator on one real and one fake batch
        discriminator.trainable = True
        d_loss_real = discriminator.train_on_batch(get_random_data(1), real_data_Y)
        d_loss_fake = discriminator.train_on_batch(generated_images_sr, fake_data_Y)
        discriminator_loss = 0.5 * np.add(d_loss_fake, d_loss_real)
        # train the generator through the combined model with D frozen
        gan_Y = np.ones(batch_size) - np.random.random_sample(batch_size)*0.2
        discriminator.trainable = False
        gan_loss = gan.train_on_batch(get_random_data(0), [get_random_data(1), gan_Y])

print("discriminator_loss : %f" % discriminator_loss)
print("gan_loss :", gan_loss)

我在 docker 容器tensorflow/tensorflow:2.0.0-gpu-py3中的GTX2080上运行此代码。

训练 GAN 意味着一些不会在 GPU 上执行的开销。在您的情况下,生成 real_data_Y 和 fake_data_Y、执行 get_random_data() 以及计算损失都会导致 GPU 空闲时间。

您可以尝试使用 python -m cProfile -o performance.prof xxx.py 分析您的程序,看看是否存在可以改进的瓶颈,但 60% 到 70% 的利用率似乎已经不算太糟糕了。

最新更新