我正在尝试构建一个 VAE 来编码电影名称,并在 8 个 GPU 上进行训练。该模型在单个 GPU 上可以按预期编译并训练(fit),但当我尝试在多个 GPU 上运行时就会出错。以下是自动编码器的基本代码:
from keras.layers import Input, GRU, RepeatVector, Conv1D, Dense, TimeDistributed, Dropout, MaxPooling1D
from keras.models import Model
from keras.utils import to_categorical, plot_model
from keras.callbacks import ModelCheckpoint
import numpy as np
from keras import backend as K
from keras import metrics
from keras.layers import Lambda, Flatten, Layer
from keras import losses
import tensorflow as tf
import random
# Open file with 20k movie names from imdb. The context manager guarantees
# the handle is closed even if reading fails part-way through.
with open('/home/ubuntu/MovieNames/data/movies.dat') as movies:
    # read data: one record per line, fields separated by tabs.
    # NOTE(review): the original split on the letter "t", which looks like a
    # lost "\t" escape -- confirm the file really is tab-delimited.
    data = [line.split("\t") for line in movies]
# The title is taken from the second column of every record.
names = [x[1] for x in data]
# get rid of the header
movie_names = names[1:]
# Vocabulary: lowercase letters and space, followed by the two special tokens.
chars = list('abcdefghijklmnopqrstuvwxyz ') + ['<END>', '<NULL>']
indices_for_chars = {ch: idx for idx, ch in enumerate(chars)}
NAME_MAX_LEN = 35  # include the <END> char


def name_to_vec(name, maxlen=NAME_MAX_LEN):
    """Encode ``name`` as a fixed-length vector of character indices.

    The name is lowercased (case is ignored). Characters that are not in
    ``chars`` are mapped to the ``<NULL>`` index, which is also used as
    padding. An ``<END>`` marker is written right after the last character,
    or into the final slot when the name is ``maxlen - 1`` characters or
    longer (overwriting whatever character was there).

    Args:
        name: The string to encode.
        maxlen: Length of the returned vector.

    Returns:
        A 1-D integer ``numpy`` array of length ``maxlen``.
    """
    lowered = name.lower()
    pad_idx = indices_for_chars['<NULL>']
    # Start fully padded, then overwrite the leading positions.
    vec = np.full(maxlen, pad_idx, dtype=int)
    for pos, ch in enumerate(lowered[:maxlen]):
        vec[pos] = indices_for_chars.get(ch, pad_idx)
    end_pos = min(len(lowered), maxlen - 1)
    vec[end_pos] = indices_for_chars['<END>']
    return vec
# convert to Keras-compatible form: one-hot encode the index vector of every
# movie title and stack the results into a single array.
one_hot_names = [
    to_categorical(name_to_vec(title), num_classes=len(chars))
    for title in movie_names
]
names = np.array(one_hot_names)

# Global parameters
NAME_LENGTH = names.shape[1]  # size of axis 1 of the encoded array
ALPHABET = names.shape[2]     # size of axis 2 of the encoded array
latent_dim = 10 * 8
intermediate_dim = 24 * 8
batch_size = 100 * 8
epochs = 20
epsilon_std = 0.01
# Encoder: three 1-D convolutions (with dropout and one max-pooling step)
# over the one-hot encoded name, flattened into a dense layer that feeds the
# two latent heads ``z_mean`` and ``z_log_var``.
i = Input(shape=(NAME_LENGTH, ALPHABET))
x = Conv1D(256, 9)(i)
x = Dropout(0.2)(x)
x = Conv1D(256, 7)(x)
# ``pool_size`` is the Keras 2 name of this argument; ``pool_length`` is the
# legacy Keras 1 spelling and is only accepted through a deprecation shim.
x = MaxPooling1D(pool_size=3)(x)
x = Dropout(0.2)(x)
x = Conv1D(256, 3)(x)
x = Dropout(0.2)(x)
x = Flatten()(x)
x = Dense(intermediate_dim, activation='relu')(x)
x = Dropout(0.2)(x)
# Parameters of the approximate posterior over the latent code.
z_mean = Dense(latent_dim)(x)
z_log_var = Dense(latent_dim)(x)
def sampling(args):
    """Draw a latent sample with the reparameterization trick.

    Args:
        args: A pair ``(z_mean, z_log_var)`` of tensors produced by the
            encoder heads.

    Returns:
        A tensor ``z_mean + sigma * epsilon`` where ``epsilon`` is Gaussian
        noise with standard deviation ``epsilon_std``.
    """
    z_mean, z_log_var = args
    # Size the noise from the incoming tensor rather than from the global
    # ``batch_size``: the actual batch can be smaller (a per-GPU slice, or
    # the last partial batch), and a fixed shape would then not match.
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim),
                              mean=0., stddev=epsilon_std)
    # ``z_log_var`` is a log-variance (the KL term in ``vae_objective`` uses
    # ``exp(z_log_var)`` as the variance), so the standard deviation is
    # ``exp(z_log_var / 2)``.
    return z_mean + K.exp(z_log_var / 2) * epsilon
# Sample the latent code from the encoder outputs.
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# Decoder: expand the latent code, repeat it once per character position and
# run it through two GRUs; the last layer emits a softmax over the alphabet
# at every position. Layers are created in the same order as they are applied.
decoder_stack = [
    Dense(intermediate_dim, activation='relu'),
    RepeatVector(NAME_LENGTH),
    GRU(256, return_sequences=True),
    Dropout(0.2),
    GRU(256, return_sequences=True),
    TimeDistributed(Dense(ALPHABET, activation='softmax'), name='decoded_mean'),
]
h = z
for decoder_layer in decoder_stack:
    h = decoder_layer(h)

# End-to-end model: one-hot name in, per-position character distribution out.
autoencoder = Model(i, h)
def vae_objective(y_true, y_pred):
    """VAE loss: reconstruction cross-entropy plus KL divergence.

    Args:
        y_true: One-hot encoded target names.
        y_pred: Predicted per-position character distributions.

    Returns:
        A per-sample loss tensor: the cross-entropy summed over axis 1 plus
        the KL term computed from the module-level ``z_mean`` / ``z_log_var``.

    NOTE(review): ``z_mean`` and ``z_log_var`` are the tensors of the
    original single-GPU graph (they depend on the input ``i``). When this
    loss is used with a model built by ``to_multi_gpu`` that input is not
    fed, which is presumably what triggers the "You must feed a value for
    placeholder tensor" error -- confirm before relying on multi-GPU training.
    """
    # ``losses.categorical_crossentropy`` always takes (y_true, y_pred); the
    # backend function ``K.categorical_crossentropy`` changed its argument
    # order between Keras releases, so calling it directly is fragile.
    recon = K.sum(losses.categorical_crossentropy(y_true, y_pred), axis=1)
    # KL divergence between N(z_mean, exp(z_log_var)) and N(0, 1).
    kl = 0.5 * K.sum(
        K.exp(z_log_var) + K.square(z_mean) - 1. - z_log_var, axis=1)
    return recon + kl
然后,我使用 Keras 多 GPU 工具并行化代码:
from keras import backend as K
from keras.models import Model
from keras.layers import Input
from keras.layers.core import Lambda
from keras.layers.merge import Concatenate
def slice_batch(x, n_gpus, part):
    """Return slice number ``part`` of the batch ``x`` split ``n_gpus`` ways.

    The batch (first) axis is divided into ``n_gpus`` slices of equal size;
    the last slice additionally takes any remainder.
    i.e. if len(x)=10, then slice_batch(x, 2, 1) will return x[5:].

    Args:
        x: Input tensor whose first axis is split.
        n_gpus: Number of slices to divide the batch into.
        part: Zero-based index of the slice to return.

    Returns:
        The selected slice of ``x``.
    """
    sh = K.shape(x)
    # Floor division keeps the slice bounds integral; ``/`` is true division
    # on Python 3 and would not produce a usable slice index.
    slice_len = sh[0] // n_gpus
    if part == n_gpus - 1:
        # Last slice: run to the end so no trailing samples are dropped.
        return x[part * slice_len:]
    return x[part * slice_len:(part + 1) * slice_len]
def to_multi_gpu(model, n_gpus=2):
    """Given a keras [model], return an equivalent model which parallelizes
    the computation over [n_gpus] GPUs.

    Each GPU gets a slice of the input batch, applies the model on that slice
    and later the outputs of the models are concatenated to a single tensor,
    hence the user sees a model that behaves the same as the original.

    Args:
        model: The Keras model to replicate; its first input is used.
        n_gpus: Number of GPU devices to spread each batch over.

    Returns:
        ``model`` itself when ``n_gpus`` is 1, otherwise a new ``Model`` that
        wraps one tower per GPU.
    """
    if n_gpus == 1:
        # Nothing to parallelize, and Concatenate cannot merge a single tensor.
        return model

    # The shared input placeholder lives on the CPU.
    with tf.device('/cpu:0'):
        x = Input(model.input_shape[1:], name=model.input_names[0])

    towers = []
    for g in range(n_gpus):
        with tf.device('/gpu:' + str(g)):
            # Each tower sees only its own slice of the batch; the Lambda
            # reports an unchanged output shape.
            slice_g = Lambda(slice_batch, lambda shape: shape,
                             arguments={'n_gpus': n_gpus, 'part': g})(x)
            towers.append(model(slice_g))

    # Stitch the per-GPU outputs back together along the batch axis.
    with tf.device('/cpu:0'):
        merged = Concatenate(axis=0)(towers)

    return Model(inputs=[x], outputs=[merged])
在调用 fit 训练模型时,我遇到了问题:
# Wrap the autoencoder so that every batch is split across 8 GPUs, then
# train it to reconstruct the first 8000 encoded names.
model = to_multi_gpu(autoencoder, n_gpus=8)
model.compile(optimizer='adam', loss=vae_objective, metrics=["accuracy"])
train_names = names[:8000]
model.fit(train_names, train_names, batch_size=batch_size)
给我以下错误:
InvalidArgumentError: You must feed a value for placeholder tensor 'input_4' with dtype float
[[Node: input_4 = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
请注意,所有参数都可以被 GPU 的数量整除,所以我不认为这是问题所在。
解决方法:改用以下代码:
# Same training run, but the batch passed to fit() is scaled by the number
# of GPUs so that each tower receives ``batch_size`` samples.
n_gpus = 8
model = to_multi_gpu(autoencoder, n_gpus=n_gpus)
model.compile(optimizer='adam', loss=vae_objective, metrics=["accuracy"])
train_names = names[:8000]
model.fit(train_names, train_names, batch_size=batch_size * n_gpus)
也就是说,构建 VAE(采样层)时使用 batch_size,而调用 fit 时使用 batch_size * GPU 数量,这样每个 GPU 分到的切片大小恰好是 batch_size。
同时要确保样本总数可以被 batch_size * GPU 数量整除。