I followed this link to implement a cDCGAN on my own dataset. My dataset contains almost 391,510 images. The images in my dataset are 64x64, whereas the MNIST images used in the link are 28x28. My dataset has 2350 labels, whereas the MNIST dataset has 10.
My dataset is in .tfrecords format, so I use the get_image() function shown below to retrieve batches of images and labels. When I run the code, I get the following error:
`tensorflow.python.framework.errors_impl.InternalError: Dst tensor is not initialized.
[[Node: _arg_Placeholder_3_0_3/_43 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_2488__arg_Placeholder_3_0_3", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]
[[Node: discriminator_1/batch_normalization/AssignMovingAvg_1/_86 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_2364_discriminator_1/batch_normalization/AssignMovingAvg_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]`
When I searched for this error, I found that it occurs when the batch size is too large, so I changed the batch size to 32, and then got this new error:
`tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[32,64,64,2351] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[Node: discriminator/concat = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"](_arg_Placeholder_0_0/_41, _arg_Placeholder_3_0_3/_43, discriminator/concat/axis)]]
Caused by op 'discriminator/concat', defined at:
  File "cdcgan.py", line 221, in <module>
    D_real, D_real_logits = discriminator(x, y_fill, isTrain)
  File "cdcgan.py", line 48, in discriminator
    cat1 = tf.concat([x, y_fill], 3)`
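To put that shape in perspective, a quick back-of-the-envelope calculation (mine, not from the error message) shows that a single float32 tensor of shape [32, 64, 64, 2351] already costs over 1 GiB, and the y_fill feed is about the same size again, before counting gradients and other activations:

`num_floats = 32 * 64 * 64 * 2351     # 308,150,272 elements
size_gib = num_floats * 4 / 1024**3  # ~1.15 GiB for this single tensor`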
Here are the parts of the default code that I changed:
# (imports and the lrelu helper, unchanged from the tutorial, included for completeness)
import io
import os

import numpy as np
import tensorflow as tf

IMAGE_WIDTH = 64
IMAGE_HEIGHT = 64

# leaky ReLU helper, defined as in the linked tutorial
def lrelu(x, th=0.2):
    return tf.maximum(th * x, x)

# G(z)
def generator(x, y_label, isTrain=True, reuse=False):
    with tf.variable_scope('generator', reuse=reuse):
        # initializer
        w_init = tf.truncated_normal_initializer(mean=0.0, stddev=0.02)
        b_init = tf.constant_initializer(0.0)

        # concat layer
        cat1 = tf.concat([x, y_label], 3)

        # 1st hidden layer
        deconv1 = tf.layers.conv2d_transpose(cat1, 256, [16, 16], strides=(1, 1), padding='valid', kernel_initializer=w_init, bias_initializer=b_init)
        lrelu1 = lrelu(tf.layers.batch_normalization(deconv1, training=isTrain), 0.2)

        # 2nd hidden layer
        deconv2 = tf.layers.conv2d_transpose(lrelu1, 128, [5, 5], strides=(2, 2), padding='same', kernel_initializer=w_init, bias_initializer=b_init)
        lrelu2 = lrelu(tf.layers.batch_normalization(deconv2, training=isTrain), 0.2)

        # output layer
        deconv3 = tf.layers.conv2d_transpose(lrelu2, 1, [5, 5], strides=(2, 2), padding='same', kernel_initializer=w_init, bias_initializer=b_init)
        o = tf.nn.tanh(deconv3)

        return o
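# Sanity check on the generator's spatial sizes (standard transposed-conv rules:
# 'valid': out = (in-1)*stride + kernel; 'same': out = in*stride):
#   deconv1: (1-1)*1 + 16 = 16, deconv2: 16*2 = 32, deconv3: 32*2 = 64
# so the output matches the 64x64 dataset images.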
# D(x)
def discriminator(x, y_fill, isTrain=True, reuse=False):
    with tf.variable_scope('discriminator', reuse=reuse):
        # initializer
        w_init = tf.truncated_normal_initializer(mean=0.0, stddev=0.02)
        b_init = tf.constant_initializer(0.0)

        # concat layer: 1 image channel + 2350 one-hot channels = 2351 channels;
        # this produces the [batch, 64, 64, 2351] tensor named in the OOM error
        cat1 = tf.concat([x, y_fill], 3)

        # 1st hidden layer
        conv1 = tf.layers.conv2d(cat1, 128, [5, 5], strides=(2, 2), padding='same', kernel_initializer=w_init, bias_initializer=b_init)
        lrelu1 = lrelu(conv1, 0.2)

        # 2nd hidden layer
        conv2 = tf.layers.conv2d(lrelu1, 256, [5, 5], strides=(2, 2), padding='same', kernel_initializer=w_init, bias_initializer=b_init)
        lrelu2 = lrelu(tf.layers.batch_normalization(conv2, training=isTrain), 0.2)

        # output layer
        conv3 = tf.layers.conv2d(lrelu2, 1, [16, 16], strides=(1, 1), padding='valid', kernel_initializer=w_init)
        o = tf.nn.sigmoid(conv3)

        return o, conv3
def get_image(files, num_classes):
    """This method defines the retrieval of image examples from TFRecords files.
    Here we will define how the images will be represented (grayscale,
    flattened, floating point arrays) and how labels will be represented
    (one-hot vectors).
    """
    # Convert filenames to a queue for an input pipeline.
    file_queue = tf.train.string_input_producer(files)

    # Create object to read TFRecords.
    reader = tf.TFRecordReader()

    # Read the full set of features for a single example.
    key, example = reader.read(file_queue)

    # Parse the example to get a dict mapping feature keys to tensors.
    # image/class/label: integer denoting the index in a classification layer.
    # image/encoded: string containing a JPEG encoded image.
    features = tf.parse_single_example(
        example,
        features={
            'image/class/label': tf.FixedLenFeature([], tf.int64),
            'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
                                                default_value='')
        })

    label = features['image/class/label']
    image_encoded = features['image/encoded']

    # Decode the JPEG, convert to [0, 1] floats, and flatten.
    image = tf.image.decode_jpeg(image_encoded, channels=1)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = tf.reshape(image, [IMAGE_WIDTH*IMAGE_HEIGHT])

    # Represent the label as a one-hot vector.
    label = tf.stack(tf.one_hot(label, num_classes))

    return label, image
# training parameters
batch_size = 32
img_size = 64  # images are 64x64; img_size is used below
onehot = np.eye(2350, dtype=np.float32)  # one-hot lookup table, as in the tutorial (np.eye(10) there)
# lr = 0.0002
train_epoch = 30

global_step = tf.Variable(0, trainable=False)
lr = tf.train.exponential_decay(0.0002, global_step, 500, 0.95, staircase=True)
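# NOTE: for the exponential decay to advance, global_step must be passed to the
# optimizer in the (omitted) training-op definitions, e.g. (sketch only):
#   D_optim = tf.train.AdamOptimizer(lr, beta1=0.5).minimize(D_loss, global_step=global_step)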
# load MNIST
#mnist = input_data.read_data_sets("MNIST_data/", one_hot=True, reshape=[])
SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
# Default paths.
DEFAULT_LABEL_FILE = os.path.join(SCRIPT_PATH, './labels.txt')
DEFAULT_TFRECORDS_DIR = os.path.join(SCRIPT_PATH, 'tfrecords-output')
MODEL_NAME = 'hangul_tensorflow'
IMAGE_WIDTH = 64
IMAGE_HEIGHT = 64
DEFAULT_NUM_TRAIN_STEPS = 117453  # = 391510 images * 30 epochs / batch size of 100
"""Perform graph definition and model training.
Here we will first create our input pipeline for reading in TFRecords
files and producing random batches of images and labels.
"""
labels = io.open(DEFAULT_LABEL_FILE, 'r', encoding='utf-8').read().splitlines()
num_classes = len(labels)
print('Processing data...')
tf_record_pattern = os.path.join(DEFAULT_TFRECORDS_DIR, '%s-*' % 'train')
train_data_files = tf.gfile.Glob(tf_record_pattern)
label, image = get_image(train_data_files, num_classes)
# Associate objects with a randomly selected batch of labels and images.
image_batch, label_batch = tf.train.shuffle_batch(
    [image, label], batch_size=batch_size,
    capacity=2000,
    min_after_dequeue=1000)
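# Side note (rough arithmetic): this queue buffers examples on the CPU and is
# small next to the GPU tensors above; each flattened 64x64 float32 image is
# 64*64*4 = 16 KiB, so capacity=2000 holds only ~32 MiB of images.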
# variables : input
x = tf.placeholder(tf.float32, shape=(None, img_size, img_size, 1))
z = tf.placeholder(tf.float32, shape=(None, 1, 1, 100))
y_label = tf.placeholder(tf.float32, shape=(None, 1, 1, 2350))
y_fill = tf.placeholder(tf.float32, shape=(None, img_size, img_size, 2350))
isTrain = tf.placeholder(dtype=tf.bool)
# (losses, optimizers, and session creation are unchanged from the tutorial and omitted here)

# Initialize the queue threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
# training-loop
print('training start!')
for epoch in range(train_epoch):
    G_losses = []
    D_losses = []
    for step in range(391510 // batch_size):  # steps for one epoch
        # update discriminator
        train_images, train_labels = sess.run([image_batch, label_batch])
        x_ = train_images.reshape(-1, img_size, img_size, 1)
        y_label_ = train_labels.reshape([batch_size, 1, 1, 2350])
        # use float32 explicitly: np.ones defaults to float64, which doubles host memory
        y_fill_ = y_label_ * np.ones([batch_size, img_size, img_size, 2350], dtype=np.float32)
        z_ = np.random.normal(0, 1, (batch_size, 1, 1, 100))
        loss_d_, _ = sess.run([D_loss, D_optim], {x: x_, z: z_, y_fill: y_fill_, y_label: y_label_, isTrain: True})

        # update generator
        z_ = np.random.normal(0, 1, (batch_size, 1, 1, 100))
        y_ = np.random.randint(0, 2350, (batch_size, 1))  # sample from all 2350 classes, not just 0-9
        y_label_ = onehot[y_.astype(np.int32)].reshape([batch_size, 1, 1, 2350])
        y_fill_ = y_label_ * np.ones([batch_size, img_size, img_size, 2350], dtype=np.float32)
        loss_g_, _ = sess.run([G_loss, G_optim], {z: z_, x: x_, y_fill: y_fill_, y_label: y_label_, isTrain: True})
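# After training, the queue threads should be shut down cleanly, e.g.:
#   coord.request_stop()
#   coord.join(threads)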
These are my system specs:

`name: GeForce GTX 1070  major: 6  minor: 1  memoryClockRate(GHz): 1.645
pciBusID: 0000:01:00.0
totalMemory: 8.00GiB  freeMemory: 6.62GiB`

How can I solve my problem?
So after searching on my own, I arrived at a solution. I applied some tricks from this answer. I reduced the batch size from 32 to 16, which slowed training down, but that was a trade-off I had to make. I also changed the structure of D and G by reducing the number of neurons in the hidden layers. Finally, I applied some of the TensorFlow memory-allocation tips from the answer above, which helped me.
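For reference, the memory-allocation tips boil down to the standard TF 1.x tf.ConfigProto options; a minimal sketch (the exact fraction is a judgment call, not from the linked answer):

`config = tf.ConfigProto()
# Grow GPU memory usage on demand instead of pre-allocating almost all of it.
config.gpu_options.allow_growth = True
# Alternatively, cap the fraction of GPU memory this process may claim:
# config.gpu_options.per_process_gpu_memory_fraction = 0.7
sess = tf.Session(config=config)`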
I hope my answer helps beginners like me.