我如何使Tensorflow使用更多的RAM?



我正在尝试用tensorflow/keras构建GAN。我有32GB的内存,我甚至在此基础上又增加了70GB的虚拟内存。当我尝试使用大约40000张(128,128,3)的图像时,tensorflow停止,我得到:

2021-01-22 00:12:42.680822: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 9232121856 exceeds 10% of free system memory.

然而,它使用的最大内存量是25GB,而我的理论内存是100GB。即使tensorflow不能使用虚拟内存,它也只使用了我80%的内存棒。改变批处理大小和可训练参数的数量没有任何作用。是在说VRAM吗?我做错了什么吗?我如何使tensorflow使用更多的RAM?
我目前使用的是:
python 3.8
tf-gpu 2.4.0rc1
keras 2.4.3

编辑:我试着把图像的分辨率降低到(64,64,3),这让我可以使用我的整个数据集106000。这一次它进行了训练,但仍然给出了10%的内存警告,但它只使用了我70%的记忆棒。

完整代码:

import imageio, os, PIL, random, pickle
from glob import glob
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import layers
import time
from cv2 import cv2
from IPython import display
config = tf.compat.v1.ConfigProto(gpu_options = tf.compat.v1.GPUOptions(
per_process_gpu_memory_fraction=0.725))
for device in tf.config.experimental.list_physical_devices("GPU"):
tf.config.experimental.set_memory_growth(device, True)
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)
IMG_SIZE = 128
BATCH_SIZE = 64
EPOCHS = 500000
noise_dim = 100
num_examples_to_generate = 96
paths = glob('F:/DATA/E621_GAN/**', recursive=True)
del paths[0]
print(int((len(paths)) / BATCH_SIZE), "STEPS")

def make_generator_model():
model = tf.keras.Sequential()
model.add(layers.Dense(int(IMG_SIZE/8)*int(IMG_SIZE/8)*256, use_bias=False, input_shape=(noise_dim,)))
model.add(layers.BatchNormalization())
model.add(layers.LeakyReLU())
model.add(layers.Reshape((int(IMG_SIZE/8), int(IMG_SIZE/8), 256)))
assert model.output_shape == (None, int(IMG_SIZE/8), int(IMG_SIZE/8), 256) # Note: None is the batch size
model.add(layers.Conv2DTranspose(256, (7, 7), strides=(1, 1), padding='same', use_bias=False))
assert model.output_shape == (None, int(IMG_SIZE/8), int(IMG_SIZE/8), 256)
model.add(layers.BatchNormalization())
model.add(layers.LeakyReLU())
model.add(layers.Conv2DTranspose(128, (7, 7), strides=(2, 2), padding='same', use_bias=False))
assert model.output_shape == (None, int(IMG_SIZE/4), int(IMG_SIZE/4), 128)
model.add(layers.BatchNormalization())
model.add(layers.LeakyReLU())
model.add(layers.Conv2DTranspose(64, (7, 7), strides=(2, 2), padding='same', use_bias=False))
assert model.output_shape == (None, int(IMG_SIZE/2), int(IMG_SIZE/2), 64)
model.add(layers.BatchNormalization())
model.add(layers.LeakyReLU())
model.add(layers.Conv2DTranspose(3, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh'))
assert model.output_shape == (None, IMG_SIZE, IMG_SIZE, 3)
model.summary()
return model
generator = make_generator_model()
noise = tf.random.normal([1, noise_dim])
generated_image = generator(noise, training=False)
def make_discriminator_model():
model = tf.keras.Sequential()
model.add(layers.Conv2D(32, (7, 7), strides=(2, 2), padding='same', input_shape=(IMG_SIZE, IMG_SIZE, 3)))
model.add(layers.LeakyReLU())
model.add(layers.Dropout(0.25))
model.add(layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same'))
model.add(layers.LeakyReLU())
model.add(layers.Dropout(0.25))
model.add(layers.Conv2D(128, (5, 5), strides=(2, 2), padding='same'))
model.add(layers.LeakyReLU())
model.add(layers.Dropout(0.25))
model.add(layers.Flatten())
model.add(layers.Dense(1))
model.summary()
return model
discriminator = make_discriminator_model()
decision = discriminator(generated_image)
# This method returns a helper function to compute cross entropy loss
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
def discriminator_loss(real_output, fake_output):
real_loss = cross_entropy(tf.ones_like(real_output), real_output)
fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
total_loss = real_loss + fake_loss
return total_loss

def generator_loss(fake_output):
return cross_entropy(tf.ones_like(fake_output), fake_output)
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)
checkpoint_dir = 'D:/DATA/E621_SAVES/checkpoints/'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
discriminator_optimizer=discriminator_optimizer,
generator=generator,
discriminator=discriminator)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
seed = tf.random.normal([num_examples_to_generate, noise_dim])
@tf.function
def train_step(images):
noise = tf.random.normal([BATCH_SIZE, noise_dim])
with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
generated_images = generator(noise, training=True)
real_output = discriminator(images, training=True)
fake_output = discriminator(generated_images, training=True)
gen_loss = generator_loss(fake_output)
disc_loss = discriminator_loss(real_output, fake_output)
gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
return gen_loss, disc_loss
print("Loading Dataset")
images = []
# 1 GB of images = about 50 GB of data
for i in range(len(paths)-60000):
try:
image_path = paths[i]
image = cv2.imread(image_path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = np.float32(image)
images.append((image - 127.5) / 127.5)
except:
pass
print("Finished Loading Dataset")
images = np.array(images)
print(images.shape)
train_dataset = tf.data.Dataset.from_tensor_slices(images).shuffle(99999999).batch(BATCH_SIZE)
print("Finished Organizing Dataset")
epoch_save = 0
try:
data = [epoch_save, seed]
with open('D:/DATA/E621_SAVES/checkpoints/epoch.pkl', 'rb') as save:
data = pickle.load(save)
epoch_save = data[0]
seed = data[1]
except:
print("No pickle found")
def train(dataset, epochs, epoch_save, seed):
for epoch in range(epochs - epoch_save):
start = time.time()
epoch_save += 1
gen_loss_list, disc_loss_list = [], []
for image_batch in dataset:
t = train_step(image_batch)
gen_loss_list.append(t[0])
disc_loss_list.append(t[1])

g_loss = sum(gen_loss_list) / len(gen_loss_list)
d_loss = sum(disc_loss_list) / len(disc_loss_list)
display.clear_output(wait=True)
generate_and_save_images(generator,
epoch_save,
seed)
if (epoch + 1) % 25 == 0:
checkpoint.save(file_prefix = checkpoint_prefix)
data = [epoch_save, seed]
with open('D:/DATA/E621_SAVES/checkpoints/epoch.pkl', 'wb') as save:
pickle.dump(data, save)


print(f'Epoch {epoch_save}: gen_loss={g_loss}, disc_loss={d_loss}, time: {np.round(time.time()-start, 4)} sec')
fig = plt.figure(figsize=(15.36,10.24))
def generate_and_save_images(model, epoch, test_input):
predictions = model(test_input, training=False)
ax = []
for i in range(predictions.shape[0]):
ax.append(plt.subplot(8,12,i+1))
plt.imshow(predictions[i, :, :, 0] * 127.5 + 127.5)
for a in ax:
a.set_xticks([])
a.set_yticks([])
fig.subplots_adjust(top = 1, bottom = 0, right = 1, left = 0, wspace=0, hspace=0)
plt.savefig('D:/DATA/E621_SAVES/epoch_{:04d}.png'.format(epoch), dpi=100)
plt.clf()
train(train_dataset, EPOCHS, epoch_save, seed)

输出:

2021-01-22 00:12:03.372768: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2021-01-22 00:12:05.743931: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-01-22 00:12:05.744894: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library nvcuda.dll
2021-01-22 00:12:05.765514: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 3070 computeCapability: 8.6
coreClock: 1.725GHz coreCount: 46 deviceMemorySize: 8.00GiB deviceMemoryBandwidth: 417.29GiB/s
2021-01-22 00:12:05.765726: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2021-01-22 00:12:05.775582: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2021-01-22 00:12:05.775689: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2021-01-22 00:12:05.778527: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cufft64_10.dll
2021-01-22 00:12:05.779685: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library curand64_10.dll
2021-01-22 00:12:05.785534: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusolver64_10.dll
2021-01-22 00:12:05.787678: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusparse64_11.dll
2021-01-22 00:12:05.788518: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2021-01-22 00:12:05.788665: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2021-01-22 00:12:05.789173: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-01-22 00:12:05.790050: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 3070 computeCapability: 8.6
coreClock: 1.725GHz coreCount: 46 deviceMemorySize: 8.00GiB deviceMemoryBandwidth: 417.29GiB/s
2021-01-22 00:12:05.790231: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2021-01-22 00:12:05.790305: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2021-01-22 00:12:05.790383: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2021-01-22 00:12:05.790582: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cufft64_10.dll
2021-01-22 00:12:05.790704: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library curand64_10.dll
2021-01-22 00:12:05.790816: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusolver64_10.dll
2021-01-22 00:12:05.790887: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusparse64_11.dll
2021-01-22 00:12:05.790954: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2021-01-22 00:12:05.791139: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2021-01-22 00:12:06.170269: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1261] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-01-22 00:12:06.170365: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267]      0 
2021-01-22 00:12:06.170489: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1280] 0:   N 
2021-01-22 00:12:06.170738: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1406] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 5939 MB memory) -> physical GPU (device: 0, name: GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6)
2021-01-22 00:12:06.171371: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
WARNING:tensorflow:From f:PYTHONGANGAN.py:16: The name tf.keras.backend.set_session is deprecated. Please use tf.compat.v1.keras.backend.set_session instead.
1671 STEPS
2021-01-22 00:12:08.495267: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 3070 computeCapability: 8.6
coreClock: 1.725GHz coreCount: 46 deviceMemorySize: 8.00GiB deviceMemoryBandwidth: 417.29GiB/s
2021-01-22 00:12:08.495435: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2021-01-22 00:12:08.495586: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2021-01-22 00:12:08.495694: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2021-01-22 00:12:08.495766: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cufft64_10.dll
2021-01-22 00:12:08.495835: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library curand64_10.dll
2021-01-22 00:12:08.495905: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusolver64_10.dll
2021-01-22 00:12:08.495982: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusparse64_11.dll
2021-01-22 00:12:08.496054: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2021-01-22 00:12:08.496143: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2021-01-22 00:12:08.496223: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1261] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-01-22 00:12:08.496293: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267]      0 
2021-01-22 00:12:08.496340: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1280] 0:   N 
2021-01-22 00:12:08.496454: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1406] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 5939 MB memory) -> physical GPU (device: 0, name: GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6)
2021-01-22 00:12:08.496788: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense (Dense)                (None, 65536)             6553600   
_________________________________________________________________
batch_normalization (BatchNo (None, 65536)             262144    
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 65536)             0         
_________________________________________________________________
reshape (Reshape)            (None, 16, 16, 256)       0         
_________________________________________________________________
conv2d_transpose (Conv2DTran (None, 16, 16, 256)       3211264   
_________________________________________________________________
batch_normalization_1 (Batch (None, 16, 16, 256)       1024      
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 16, 16, 256)       0         
_________________________________________________________________
conv2d_transpose_1 (Conv2DTr (None, 32, 32, 128)       1605632   
_________________________________________________________________
batch_normalization_2 (Batch (None, 32, 32, 128)       512       
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 32, 32, 128)       0         
_________________________________________________________________
conv2d_transpose_2 (Conv2DTr (None, 64, 64, 64)        401408    
_________________________________________________________________
batch_normalization_3 (Batch (None, 64, 64, 64)        256       
_________________________________________________________________
leaky_re_lu_3 (LeakyReLU)    (None, 64, 64, 64)        0         
_________________________________________________________________
conv2d_transpose_3 (Conv2DTr (None, 128, 128, 3)       4800      
=================================================================
Total params: 12,040,640
Trainable params: 11,908,672
Non-trainable params: 131,968
_________________________________________________________________
2021-01-22 00:12:08.793121: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2021-01-22 00:12:09.334054: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2021-01-22 00:12:09.335006: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2021-01-22 00:12:09.337482: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2021-01-22 00:12:10.319075: I tensorflow/core/platform/windows/subprocess.cc:308] SubProcess ended with return code: 0
2021-01-22 00:12:10.349989: I tensorflow/core/platform/windows/subprocess.cc:308] SubProcess ended with return code: 0
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 64, 64, 32)        4736      
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 64, 64, 32)        0         
_________________________________________________________________
dropout (Dropout)            (None, 64, 64, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 32, 32, 64)        51264     
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU)    (None, 32, 32, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 32, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 16, 16, 128)       204928    
_________________________________________________________________
leaky_re_lu_6 (LeakyReLU)    (None, 16, 16, 128)       0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 16, 16, 128)       0         
_________________________________________________________________
flatten (Flatten)            (None, 32768)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 32769     
=================================================================
Total params: 293,697
Trainable params: 293,697
Non-trainable params: 0
_________________________________________________________________
Loading Dataset
Finished Loading Dataset
(46957, 128, 128, 3)
Finished Organizing Dataset
No pickle found
2021-01-22 00:12:42.680822: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 9232121856 exceeds 10% of free system memory.

如果您包含虚拟内存,即使使用NVMe驱动器,由于缓存速度减慢,您也不应该看到100%的内存使用率。除此之外,TensorFlow预测地抛出该标志。TensorFlow上的GPU计算往往会占用大量RAM。我最好的建议是将批处理大小降低2倍,直到它运行。

我不确定是否相关的另一点是,在nVidia的Isaac应用论坛中报告了大量RTX3xxx卡的问题。

相关内容

  • 没有找到相关文章

最新更新