运行 TensorFlow 时 GPU 利用率低

我一直在使用 TensorFlow 和 OpenAI Gym 进行深度强化学习。我的问题是 GPU 利用率很低。搜索之后我了解到,在训练小型网络(例如在 MNIST 上训练)时,不应期望很高的 GPU 利用率。但我认为我的神经网络并没有那么小。其架构与 DeepMind 原始论文中给出的架构大致相同。我的网络架构总结如下:

  1. 卷积层 1(滤波器数=32,kernel_size=8x8,步长=4)

  2. 卷积层 2(滤波器数=64,kernel_size=4x4,步长=2)

  3. 卷积层 3(滤波器数=64,kernel_size=3x3,步长=1)

  4. 全连接层(Dense,单元数=512)

  5. 输出层(单元数=9)

我在一块 Tesla P100 16GB GPU 上训练。我的学习算法是简单的 DQN(同样来自 DeepMind 论文),超参数均与论文中给出的一致。然而 GPU 利用率仍远低于 10%(由 nvidia-smi 显示)。问题可能出在哪里?

import tensorflow as tf
import numpy as np
import os, sys
import gym
from collections import deque
from time import sleep
import os

# Expose only the GPU with index 1 to CUDA-aware libraries in this process.
# NOTE(review): on a machine whose only GPU has index 0 this would hide it,
# and TensorFlow would presumably fall back to the CPU -- confirm the index
# against the `nvidia-smi` listing.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

def reset_graph(seed=142):
    """Reset the default TensorFlow graph and seed the random generators.

    NOTE(review): only the ``def`` line survived in the listing; the body
    below is the conventional TF1 reset/seed sequence and is an assumption
    -- confirm against the original script.

    Args:
        seed: Seed passed to both the TensorFlow and the NumPy generators.
    """
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

def preprocess_observation(obs):
    """Crop, downsample and grey-scale one raw frame.

    Args:
        obs: Array indexed as ``obs[row, col, channel]``. Rows 34..209 are
            kept and every second row/column is sampled, so the input must
            yield exactly 88 x 80 pixels after slicing (a 210 x 160 frame
            does).

    Returns:
        An ``(88, 80)`` float array holding the per-pixel channel mean
        divided by 255.
    """
    img = obs[34:210:2, ::2]  # crop and downsize
    return np.mean(img, axis=2).reshape(88, 80) / 255.0

def combine_observations_multichannel(preprocessed_observations):
    """Stack a sequence of 2-D frames into one multi-channel array.

    Args:
        preprocessed_observations: Iterable of equally shaped 2-D arrays
            (frame index first once converted to an array).

    Returns:
        An array of shape ``(height, width, n_frames)``: the frame axis is
        moved to the last position so each frame becomes one channel.
    """
    return np.array(preprocessed_observations).transpose([1, 2, 0])

# Number of consecutive preprocessed frames that make up one state.
n_observations_per_state = 3
# Rolling window of the most recent frames; older frames fall off automatically
# because of maxlen.
preprocessed_observations = deque([], maxlen=n_observations_per_state)
env = gym.make("Breakout-v0")
obs = env.reset()

# Shape of one network input: preprocessed frames are 88x80 and are stacked
# along the last axis.
input_height = 88
input_width = 80
# NOTE(review): duplicates n_observations_per_state; the two must stay equal.
input_channels = 3
# Per-layer settings of the three convolutional layers (consumed in lockstep).
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8,8), (4,4), (3,3)]
conv_strides = [4, 2, 1]
conv_paddings = ["SAME"] * 3 
conv_activation = [tf.nn.relu] * 3
n_hidden_in = 64 * 11 * 10  # conv3 has 64 maps of 11x10 each (88x80 input, SAME padding, strides 4, 2, 1)
n_hidden = 512
hidden_activation = tf.nn.relu
n_outputs = env.action_space.n  # number of discrete actions available
initializer = tf.variance_scaling_initializer()

def q_network(X_state, name):
    """Build one Q-network (conv stack -> dense -> linear output).

    The listing this was taken from had lost its indentation and the tail
    of four calls; the missing keyword arguments were restored from the
    module-level ``initializer`` and ``hidden_activation`` settings, which
    are otherwise unused.

    Args:
        X_state: Input tensor of states, laid out as
            ``[batch, height, width, channels]``.
        name: Variable scope under which all layers are created.

    Returns:
        A pair ``(outputs, trainable_vars_by_name)``: the Q-value tensor
        with ``n_outputs`` columns, and a dict mapping each trainable
        variable's name *relative to the scope* to the variable, so that
        two networks built under different scopes share the same keys.
    """
    prev_layer = X_state
    with tf.variable_scope(name) as scope:
        for n_maps, kernel_size, strides, padding, activation in zip(
                conv_n_maps, conv_kernel_sizes, conv_strides,
                conv_paddings, conv_activation):
            prev_layer = tf.layers.conv2d(
                prev_layer, filters=n_maps, kernel_size=kernel_size,
                strides=strides, padding=padding, activation=activation,
                kernel_initializer=initializer)
        last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, n_hidden_in])
        hidden = tf.layers.dense(last_conv_layer_flat, n_hidden,
                                 activation=hidden_activation,
                                 kernel_initializer=initializer)
        # No activation on the output layer: Q-values are unbounded.
        outputs = tf.layers.dense(hidden, n_outputs,
                                  kernel_initializer=initializer)
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope=scope.name)
    # Strip the scope prefix so online/target variables can be matched by key.
    trainable_vars_by_name = {var.name[len(scope.name):]: var
                              for var in trainable_vars}
    return outputs, trainable_vars_by_name

# Input placeholder shared by both networks: [batch, height, width, channels].
# (The closing `input_channels])` was missing from the listing.)
X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width,
                                            input_channels])
# The online network is trained; the target network only receives copies.
online_q_values, online_vars = q_network(X_state, name="q_networks/online")
target_q_values, target_vars = q_network(X_state, name="q_networks/target")
# One assign op per variable, matched through the scope-relative names.
copy_ops = [target_var.assign(online_vars[var_name])
            for var_name, target_var in target_vars.items()]
copy_online_to_target = tf.group(*copy_ops)

learning_rate = 0.001
momentum = 0.95
# NOTE(review): the listing lost its indentation; everything down to
# `training_op` is assumed to belong to the "train" scope -- confirm, since
# the scope affects variable names stored in checkpoints.
with tf.variable_scope("train"):
    X_action = tf.placeholder(tf.int32, shape=[None])
    y = tf.placeholder(tf.float32, shape=[None, 1])
    # Select the Q-value of the action actually taken via a one-hot mask.
    # `keep_dims` is the TF1 spelling (later renamed `keepdims`); left as-is
    # to stay compatible with the TF version the script targets.
    q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, n_outputs),
                            axis=1, keep_dims=True)
    loss = tf.reduce_mean((y - q_value) ** 2)
    # Incremented by the optimizer on every training step.
    global_step = tf.Variable(0, trainable=False, name='global_step')
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
    training_op = optimizer.minimize(loss, global_step=global_step)

replay_memory_size = 500000
# Each entry is a tuple (state, action, reward, next_state, continue).
replay_memory = deque([], maxlen=replay_memory_size)


def sample_memories(batch_size):
    """Draw a random batch of distinct transitions from ``replay_memory``.

    Args:
        batch_size: Maximum number of transitions to draw; fewer are
            returned when the memory holds fewer entries.

    Returns:
        A 5-tuple of arrays ``(states, actions, rewards, next_states,
        continues)``; ``rewards`` and ``continues`` are reshaped to column
        vectors.
    """
    # NOTE(review): this builds a permutation of the whole memory on every
    # call, and indexing into the middle of a deque is not constant time.
    # With a large memory this CPU-side work may dominate each training
    # step; an array-backed buffer would avoid both costs.
    indices = np.random.permutation(len(replay_memory))[:batch_size]
    cols = [[], [], [], [], []]  # state, action, reward, next_state, continue
    for idx in indices:
        memory = replay_memory[idx]
        for col, value in zip(cols, memory):
            # This line was missing from the listing (empty loop body).
            col.append(value)
    cols = [np.array(col) for col in cols]
    return cols[0], cols[1], cols[2].reshape(-1, 1), cols[3], cols[4].reshape(-1, 1)

eps_min = 0.1
eps_max = 1.0
eps_decay_steps = 2000000


def epsilon_greedy(q_values, step):
    """Pick an action with a linearly decaying epsilon-greedy policy.

    Epsilon falls linearly from ``eps_max`` at step 0 to ``eps_min`` at
    ``eps_decay_steps`` and stays at ``eps_min`` afterwards.

    Args:
        q_values: Q-value estimates for the current state.
        step: Current training step, used to compute epsilon.

    Returns:
        A random action index in ``[0, n_outputs)`` with probability
        epsilon, otherwise the index of the largest Q-value.
    """
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)  # random action
    return np.argmax(q_values)  # optimal action
# --- Training-loop settings -------------------------------------------------
n_steps = 4000000  # total number of training steps
training_start = 10000  # start training after 10,000 game iterations
training_interval = 4  # run a training step every 4 game iterations
save_steps = 1000  # save the model every 1,000 training steps
copy_steps = 10000  # copy online DQN to target DQN every 10,000 training steps
discount_rate = 0.99  # multiplies the max next-state Q-value in the training target
skip_start = 5  # Skip the start of every game (it's just waiting time).
batch_size = 64  # NOTE(review): presumably the replay batch size per training step; the call that uses it is truncated in the listing
iteration = 0  # game iterations
# NOTE(review): checkpoint_dir is not referenced elsewhere in the visible
# listing; checkpoint_path repeats the directory literally.
checkpoint_dir = './saved_networks'
checkpoint_path = "./saved_networks/dqn_breakout.cpkt"
summary_path = "./summary/"
done = True # env needs to be reset
# Summary variables: one single-element variable per per-episode statistic.
# They are filled through the placeholders below and read by the summaries.
svar_reward = tf.Variable(tf.zeros([1], dtype=tf.int32))  # Episode reward
svar_mmq = tf.Variable(tf.zeros([1]), dtype=tf.float32)  # Episode Mean-Max-Q
svar_loss = tf.Variable(tf.zeros([1], dtype=tf.float64))  # Episode summed loss
all_svars = [svar_reward, svar_mmq, svar_loss]
tf.summary.scalar("Episode Reward", tf.squeeze(svar_reward))
tf.summary.scalar("Episode Mean-Max-Q", tf.squeeze(svar_mmq))
tf.summary.scalar("Episode MSE", tf.squeeze(svar_loss))
# Placeholders (same order and dtypes as all_svars)
svar_reward_p, svar_mmq_p = tf.placeholder(tf.int32, [1]), tf.placeholder(tf.float32, [1])
svar_loss_p = tf.placeholder(tf.float64, [1])
svars_placeholders = [svar_reward_p, svar_mmq_p, svar_loss_p]
# Assign operation: pair every variable with its placeholder.
summary_assign_op = [svar.assign(placeholder)
                     for svar, placeholder in zip(all_svars, svars_placeholders)]
writer = tf.summary.FileWriter(summary_path)
summary_op = tf.summary.merge_all()
# For keeping track of no. of episodes played.
episode_step = tf.Variable(tf.zeros([1], dtype=tf.int64), trainable=False)
inc_episode_count = episode_step.assign_add([1])

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Running statistics shown in the progress line and written as summaries.
# `np.inf` replaces the `np.infty` alias, which NumPy 2.0 removed.
loss_val = np.inf  # last training loss; infinite until the first training step
game_length = 0  # steps played in the current episode
total_max_q = 0  # sum of the per-step max Q-value in the current episode
mean_max_q = 0.0  # total_max_q / game_length of the last finished episode
ep_reward = 0  # reward accumulated in the current episode
ep_loss = 0.  # training loss accumulated in the current episode
# Main loop: play the game with the online DQN, store transitions, and train
# every `training_interval` iterations once `training_start` is reached.
#
# NOTE(review): the listing had lost its indentation and several lines
# (empty `if`/`for` bodies, unterminated calls). Lines marked "restored" were
# reconstructed from the surrounding code and should be confirmed against the
# original script.
with tf.Session() as sess:
    if os.path.isfile(checkpoint_path + ".index"):
        saver.restore(sess, checkpoint_path)
        print("<--------------------- Graph restored! -------------------------->")
    else:
        # restored: without a checkpoint the variables must be initialised
        # and the target network synchronised with the online network.
        init.run()
        copy_online_to_target.run()
        print("<--------- No checkpoints found! Starting over.. ---------------->")
    while True:
        step = global_step.eval()
        if step >= n_steps:
            break  # restored
        iteration += 1
        # The `\r` / `\t` escapes had lost their backslashes in the listing.
        print("\rIteration {}\tTraining step {}/{} ({:.1f})%\tLoss {:5f}\tMean Max-Q {:5f}   ".format(
            iteration, step, n_steps, step * 100 / n_steps, loss_val, mean_max_q), end="")
        if done:  # game over, start again
            obs = env.reset()
            # Clear observations from the past episode
            preprocessed_observations.clear()  # restored
            for skip in range(skip_start):  # skip the start of each game
                obs, reward, done, info = env.step(0)  # Do nothing
                preprocessed_observations.append(preprocess_observation(obs))  # restored
            state = combine_observations_multichannel(preprocessed_observations)

        # Online DQN evaluates what to do
        q_values = online_q_values.eval(feed_dict={X_state: [state]})
        action = epsilon_greedy(q_values, step)

        # Online DQN plays
        obs, reward, done, info = env.step(action)
        ep_reward += reward
        preprocessed_observations.append(preprocess_observation(obs))  # restored
        next_state = combine_observations_multichannel(preprocessed_observations)

        # Let's memorize what happened
        replay_memory.append((state, action, reward, next_state, 1.0 - done))
        state = next_state

        # Compute statistics for tracking progress
        total_max_q += q_values.max()
        game_length += 1
        if done:
            mean_max_q = total_max_q / game_length
            # Write summary -- start
            if iteration >= training_start:
                sess.run(summary_assign_op, feed_dict={
                    svar_reward_p: [ep_reward],
                    svar_mmq_p: [mean_max_q],
                    svar_loss_p: [ep_loss],
                })
                summaries_str = sess.run(summary_op)
                # episode_step has shape [1]; pass a plain int as the step.
                writer.add_summary(summaries_str, int(sess.run(episode_step)[0]))
            # Write summary -- end
            # NOTE(review): `inc_episode_count` was defined but never run, so
            # every summary would have been written at step 0. Running it
            # once per finished episode is an assumption -- confirm.
            sess.run(inc_episode_count)
            total_max_q = 0.0
            game_length = ep_reward = ep_loss = 0

        if iteration < training_start or iteration % training_interval != 0:
            continue  # only train after warmup period and at regular intervals

        # Sample memories and use the target DQN to produce the target Q-Value
        X_state_val, X_action_val, rewards, X_next_state_val, continues = (
            sample_memories(batch_size))  # restored
        next_q_values = target_q_values.eval(
            feed_dict={X_state: X_next_state_val})
        max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
        # `continues` is 0 for terminal transitions, which drops the bootstrap term.
        y_val = rewards + continues * discount_rate * max_next_q_values

        # Train the online DQN
        _, loss_val = sess.run([training_op, loss], feed_dict={
            X_state: X_state_val, X_action: X_action_val, y: y_val})
        ep_loss += loss_val

        # Regularly copy the online DQN to the target DQN
        if step % copy_steps == 0:
            copy_online_to_target.run()  # restored

        # And save regularly
        if step % save_steps == 0:
            saver.save(sess, checkpoint_path)

原始 DQN 网络和您使用的网络对于 Tesla P100 GPU 来说都非常小。如果想更充分地利用 GPU,可以在同一块 GPU 上同时运行多个实验。

如果没有更多细节(例如查看您的代码、了解您在哪些 Gym 环境中训练、CPU 利用率、超参数值等),很难确定原因。一些可能的原因:

  • 批量大小(batch size)过小
  • 环境的 step() 函数仍然在 CPU 上运行;如果这部分耗时较长,GPU 在每次迭代中都会闲置一段时间
  • 同样的道理也适用于训练循环每次迭代中的其他各种代码(例如记录结果、向回放缓冲区存入数据、从回放缓冲区取出数据)


在简要检查代码后,我认为提高 GPU 利用率的最简单方法是把 training_interval 参数的值从 4 减小到例如 1。基本上,所有基于 TensorFlow 的代码都会(至少应该会)在 GPU 上运行,而其他所有代码都在 CPU 上运行。在不进行训练的迭代中,只有通过网络计算 Q 值的前向传播在 GPU 上运行,其余代码都在 CPU 上运行。在进行训练的迭代中,会有更多代码在 GPU 上运行:对回放缓冲区中的样本额外做的前向传播,以及用于更新网络参数的相应反向传播。因此,如果想提高 GPU 利用率,就需要提高实际在 GPU 上运行的代码的执行频率。

除此之外,我认为还可以把目前在 TensorFlow 之外进行的一些计算移到 TensorFlow 中(从而把它们从 CPU 移到 GPU)。例如,你是在 TensorFlow 之外进行 epsilon-greedy 动作选择的,而 OpenAI Baselines 的 DQN 实现是在 TensorFlow 内部完成这一步的。
