I am building a PPO agent by following the TF-Agents DQN tutorial. The idea is to work out the basic structure a simple tf-agent needs and then adapt it to a PPO agent.
I am also using a custom environment, ViZDoom.
When testing the collect_data function, I get an error. This is the code I am running and, right after it, the error I get (the full code is at the bottom):
data_doom_env = DoomEnvironment()
data_env = tf_py_environment.TFPyEnvironment(data_doom_env)
random_data_policy = random_tf_policy.RandomTFPolicy(data_env.time_step_spec(), data_env.action_spec())
collect_data(data_env, random_data_policy, replay_buffer, initial_collect_steps)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-11-d4548f4adc88> in <module>()
5 random_data_policy = random_tf_policy.RandomTFPolicy(data_env.time_step_spec(), data_env.action_spec())
6
----> 7 collect_data(data_env, random_data_policy, replay_buffer, initial_collect_steps)
4 frames
/usr/local/lib/python3.7/dist-packages/tf_agents/utils/nest_utils.py in assert_same_structure(nest1, nest2, check_types, expand_composites, allow_shallow_nest1, message)
124 lambda _: _DOT, nest2, expand_composites=expand_composites)
125 raise exception('{}:\n {}\nvs.\n {}\nValues:\n {}\nvs.\n {}.'
--> 126 .format(message, str1, str2, nest1, nest2))
127
128
TypeError: The two structures do not match:
Trajectory(
{'action': .,
'discount': .,
'next_step_type': .,
'observation': .,
'policy_info': (),
'reward': .,
'step_type': .})
vs.
Trajectory(
{'action': .,
'discount': .,
'next_step_type': .,
'observation': .,
'policy_info': DictWrapper({'dist_params': DictWrapper({'logits': .})}),
'reward': .,
'step_type': .})
Values:
Trajectory(
{'action': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>,
'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
'next_step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>,
'observation': <tf.Tensor: shape=(1, 1, 160, 260, 3), dtype=float32, numpy=
array([[[[[0.21568628, 0.21568628, 0.21568628],
[0.2627451 , 0.2627451 , 0.2627451 ],
[0.29411766, 0.29411766, 0.29411766],
...,
[0.29411766, 0.29411766, 0.29411766],
[0.2627451 , 0.2627451 , 0.2627451 ],
[0.15294118, 0.15294118, 0.15294118]],
[[0.10588235, 0.10588235, 0.10588235],
[0.15294118, 0.15294118, 0.15294118],
[0.21568628, 0.21568628, 0.21568628],
...,
[0.15294118, 0.15294118, 0.15294118],
[0.10588235, 0.10588235, 0.10588235],
[0.10588235, 0.10588235, 0.10588235]],
[[0.15294118, 0.15294118, 0.15294118],
[0.13725491, 0.13725491, 0.13725491],
[0.13725491, 0.13725491, 0.13725491],
...,
[0.15294118, 0.15294118, 0.15294118],
[0.15294118, 0.15294118, 0.15294118],
[0.15294118, 0.15294118, 0.15294118]],
...,
[[0.43529412, 0.34117648, 0.2627451 ],
[0.43529412, 0.34117648, 0.2627451 ],
[0.43529412, 0.34117648, 0.2627451 ],
...,
[0.43529412, 0.34117648, 0.2627451 ],
[0.46666667, 0.37254903, 0.29411766],
[0.46666667, 0.37254903, 0.29411766]],
[[0.48235294, 0.3882353 , 0.30980393],
[0.48235294, 0.3882353 , 0.30980393],
[0.5137255 , 0.41960785, 0.34117648],
...,
[0.46666667, 0.37254903, 0.29411766],
[0.46666667, 0.37254903, 0.29411766],
[0.46666667, 0.37254903, 0.29411766]],
[[0.40392157, 0.3254902 , 0.24705882],
[0.43529412, 0.34117648, 0.2627451 ],
[0.43529412, 0.34117648, 0.2627451 ],
...,
[0.43529412, 0.34117648, 0.2627451 ],
[0.43529412, 0.34117648, 0.2627451 ],
[0.43529412, 0.34117648, 0.2627451 ]]]]], dtype=float32)>,
'policy_info': (),
'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-1.], dtype=float32)>,
'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>})
vs.
Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name='selected_action', minimum=array(0, dtype=int32), maximum=array(4, dtype=int32)),
'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
'observation': BoundedTensorSpec(shape=(1, 160, 260, 3), dtype=tf.float32, name='screen_observation', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
'policy_info': {'dist_params': {'logits': TensorSpec(shape=(5,), dtype=tf.float32, name='CategoricalProjectionNetwork_logits')}},
'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}).
I have already tried what David Braun suggested in this StackOverflow question. It did not work (the code and error follow), and I do not really understand why he needed that change to make his code work while the official TF-Agents tutorial does not. Anyway, the code:
#Testing the data collection function with a random policy and with David Braun's suggestion
data_doom_env = DoomEnvironment()
data_env = tf_py_environment.TFPyEnvironment(data_doom_env)
random_data_policy = random_tf_policy.RandomTFPolicy(data_env.time_step_spec(), data_env.action_spec())
collect_data(data_env, random_data_policy, replay_buffer, initial_collect_steps)
And the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-13-d4548f4adc88> in <module>()
5 random_data_policy = random_tf_policy.RandomTFPolicy(data_env.time_step_spec(), data_env.action_spec())
6
----> 7 collect_data(data_env, random_data_policy, replay_buffer, initial_collect_steps)
4 frames
/usr/local/lib/python3.7/dist-packages/tf_agents/utils/nest_utils.py in assert_same_structure(nest1, nest2, check_types, expand_composites, allow_shallow_nest1, message)
124 lambda _: _DOT, nest2, expand_composites=expand_composites)
125 raise exception('{}:\n {}\nvs.\n {}\nValues:\n {}\nvs.\n {}.'
--> 126 .format(message, str1, str2, nest1, nest2))
127
128
TypeError: The two structures do not match:
Trajectory(
{'action': .,
'discount': .,
'next_step_type': .,
'observation': .,
'policy_info': (),
'reward': .,
'step_type': .})
vs.
Trajectory(
{'action': .,
'discount': .,
'next_step_type': .,
'observation': .,
'policy_info': DictWrapper({'dist_params': DictWrapper({'logits': .})}),
'reward': .,
'step_type': .})
Values:
Trajectory(
{'action': <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[0]], dtype=int32)>,
'discount': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[1.]], dtype=float32)>,
'next_step_type': <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[1]], dtype=int32)>,
'observation': <tf.Tensor: shape=(1, 1, 1, 160, 260, 3), dtype=float32, numpy=
array([[[[[[0.2627451 , 0.2627451 , 0.2627451 ],
[0.29411766, 0.29411766, 0.29411766],
[0.2627451 , 0.2627451 , 0.2627451 ],
...,
[0.2627451 , 0.2627451 , 0.2627451 ],
[0.21568628, 0.21568628, 0.21568628],
[0.15294118, 0.15294118, 0.15294118]],
[[0.10588235, 0.10588235, 0.10588235],
[0.21568628, 0.21568628, 0.21568628],
[0.2627451 , 0.2627451 , 0.2627451 ],
...,
[0.15294118, 0.15294118, 0.15294118],
[0.10588235, 0.10588235, 0.10588235],
[0.10588235, 0.10588235, 0.10588235]],
[[0.13725491, 0.13725491, 0.13725491],
[0.13725491, 0.13725491, 0.13725491],
[0.15294118, 0.15294118, 0.15294118],
...,
[0.15294118, 0.15294118, 0.15294118],
[0.15294118, 0.15294118, 0.15294118],
[0.15294118, 0.15294118, 0.15294118]],
...,
[[0.43529412, 0.34117648, 0.2627451 ],
[0.43529412, 0.34117648, 0.2627451 ],
[0.43529412, 0.34117648, 0.2627451 ],
...,
[0.46666667, 0.37254903, 0.29411766],
[0.46666667, 0.37254903, 0.29411766],
[0.46666667, 0.37254903, 0.29411766]],
[[0.48235294, 0.3882353 , 0.30980393],
[0.5137255 , 0.41960785, 0.34117648],
[0.5137255 , 0.41960785, 0.34117648],
...,
[0.46666667, 0.37254903, 0.29411766],
[0.46666667, 0.37254903, 0.29411766],
[0.46666667, 0.37254903, 0.29411766]],
[[0.43529412, 0.34117648, 0.2627451 ],
[0.37254903, 0.29411766, 0.21568628],
[0.37254903, 0.29411766, 0.21568628],
...,
[0.43529412, 0.34117648, 0.2627451 ],
[0.43529412, 0.34117648, 0.2627451 ],
[0.43529412, 0.34117648, 0.2627451 ]]]]]], dtype=float32)>,
'policy_info': (),
'reward': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-1.]], dtype=float32)>,
'step_type': <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[0]], dtype=int32)>})
vs.
Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name='selected_action', minimum=array(0, dtype=int32), maximum=array(4, dtype=int32)),
'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
'observation': BoundedTensorSpec(shape=(1, 160, 260, 3), dtype=tf.float32, name='screen_observation', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
'policy_info': {'dist_params': {'logits': TensorSpec(shape=(5,), dtype=tf.float32, name='CategoricalProjectionNetwork_logits')}},
'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}).
I noticed that the observation shapes in the two trajectories are different. It should be shape=(1, 160, 260, 3), but one of the observations has shape=(1, 1, 160, 260, 3). I do not know why that is or where I should try to fix it, since I think that is exactly what David Braun's answer addresses.
I really do not know how to proceed or what else to try; I am stuck. Does anyone know why the trajectories show different structures, and how I can fix it?
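For reference, here is the minimal check I added to compare the two structures (this assumes the agent and random_data_policy objects defined in the full code below):
# Compare what the replay buffer expects with what the random policy emits.
# agent.collect_data_spec comes from the PPO agent; random_data_policy is the RandomTFPolicy used above.
print(agent.collect_data_spec.policy_info)   # {'dist_params': {'logits': TensorSpec(shape=(5,), ...)}}
print(random_data_policy.info_spec)          # () -- an empty tuple, hence the mismatch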
The full code:
from google.colab import drive
drive.mount('/content/drive')
#%%bash
# Install deps from
# https://github.com/mwydmuch/ViZDoom/blob/master/doc/Building.md#-linux
!sudo apt update
!sudo apt upgrade
!sudo apt install build-essential zlib1g-dev libsdl2-dev libjpeg-dev nasm tar libbz2-dev libgtk2.0-dev \
  cmake git libfluidsynth-dev libgme-dev libopenal-dev timidity libwildmidi-dev unzip
# Boost libraries
!sudo apt install libboost-all-dev
# Lua binding dependencies
!sudo apt install liblua5.1-dev
!pip install tf-agents
!pip install git+https://github.com/mwydmuch/ViZDoom
#!pip uninstall vizdoom
#!pip install vizdoom
!sudo apt update
!sudo apt upgrade
from vizdoom import *
import numpy as np
import pandas as pd
import seaborn as sbrn
from __future__ import absolute_import, division, print_function
import tensorflow as tf
from tensorflow import keras
from tf_agents.agents.ppo import ppo_agent
from tf_agents.environments import py_environment
from tf_agents.environments import tf_py_environment
from tf_agents.specs import array_spec, BoundedArraySpec, ArraySpec
from tf_agents.networks.actor_distribution_rnn_network import ActorDistributionRnnNetwork
from tf_agents.networks.value_rnn_network import ValueRnnNetwork
from tf_agents.trajectories import trajectory, time_step
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
import time
import random
class DoomEnvironment (py_environment.PyEnvironment):
def __init__(self):
super().__init__()
self._game = self.create_environment()
self._state = self._game.get_state()
self._num_actions = self._game.get_available_buttons_size()
self._screen_type = 0
self._frame_stack_size = 1
self._stacked_frames = np.zeros((self._frame_stack_size, 160, 260, 3), dtype=np.float32)
self._doom_vars = [0] * 9
self._action_spec = array_spec.BoundedArraySpec(shape=(), dtype=np.int32, minimum=0, maximum=self._num_actions - 1, name='selected_action')
#1. Single image observation
self._observation_spec = array_spec.BoundedArraySpec(shape=(self._frame_stack_size, 160, 260, 3), dtype=np.float32, minimum=0, maximum=1, name='screen_observation')
def create_environment(self):
### New game instance
game = DoomGame()
### Adjust configuration file path
game.load_config('/content/drive/My Drive/ViZDoom/Config/Test_configuration.cfg')
### Adjust game scenario path
game.set_doom_scenario_path('/content/drive/My Drive/ViZDoom/Maps/basic.wad')
### Google Colab does not support the video output of ViZDoom. The following line is needed for the environment to be run.
game.set_window_visible(False)
### Adding relevant variables (reward-related variables)
game.add_available_game_variable(GameVariable.USER1)
game.add_available_game_variable(GameVariable.USER2)
game.add_available_game_variable(GameVariable.USER3)
game.add_available_game_variable(GameVariable.USER4)
game.add_available_game_variable(GameVariable.USER5)
game.add_available_game_variable(GameVariable.USER6)
game.add_available_game_variable(GameVariable.USER7)
game.add_available_game_variable(GameVariable.USER8)
game.add_available_game_variable(GameVariable.USER9)
game.add_available_game_variable(GameVariable.USER10)
#Instantiate the game
game.init()
#return game, possible_actions
return game
def _reset(self):
self._game.new_episode()
self._stacked_frames = np.zeros((self._frame_stack_size, 160, 260, 3), dtype=np.float32)
self._doom_vars = [0] * 9
t_step = time_step.restart(self._create_observation())
return t_step
def action_spec(self):
return self._action_spec
def observation_spec(self):
return self._observation_spec
def _create_observation(self):
screen = self.stack_frames()
observation_spec = screen
return observation_spec
def _step(self, selected_action):
if (self._game.is_episode_finished()):
return self.reset()
repeating_tics = 1
#Creates a vector with all possible actions set to 0, then set the selected action to 1.
action = [0] * self._num_actions
action[selected_action] = 1
#Makes action, receives reward.
reward = self._game.make_action(action, repeating_tics)
if (self._game.is_episode_finished()):
return time_step.termination(self._create_observation(), reward)
else:
return time_step.transition(self._create_observation(), reward)
def stack_frames(self):
new_frame = self.preprocess_frame()
if self._game.is_new_episode():
for frame in range(self._frame_stack_size):
self._stacked_frames[frame] = new_frame
else:
for frame in range((self._frame_stack_size) - 1):
self._stacked_frames[frame] = self._stacked_frames[frame + 1]
self._stacked_frames[self._frame_stack_size - 1] = new_frame
return self._stacked_frames
def preprocess_frame(self):
"""
Preprocess frame before stacking it:
- Region-of-interest (ROI) selected from the original frame.
Frame is cut by 40 pixels up and down, and 30 pixels for left and right.
- Normalize images to interval [0,1]
"""
frame = self.get_screen_buffer_frame()
if (self._screen_type == 0):
frame = self.get_screen_buffer_frame()
elif (self._screen_type == 1):
frame = self.get_label_buffer_frame()
roi = frame[40:-40, 30:-30]
roi_normal = np.divide(roi, 255, dtype=np.float32)
return roi_normal
def get_screen_buffer_frame(self):
""" Get the current game screen buffer or an empty screen buffer if episode is finished"""
if (self._game.is_episode_finished()):
return np.zeros((240, 320, 3), dtype=np.float32)
else:
return self._game.get_state().screen_buffer
def get_label_buffer_frame(self):
""" Get the current game label screen buffer or an empty screen buffer if episode is finished"""
if (self._game.is_episode_finished()):
return np.zeros((240, 320, 3), dtype=np.float32)
else:
return self._game.get_state().labels_buffer
def render(self, mode='rgb_array'):
""" Return game frame for rendering. """
return self.get_screen_buffer_frame()
initial_collect_steps = 1000 # @param {type:"integer"}
max_steps_per_episode = 100 # @param {type:"integer"} #4200
number_of_episodes = 500 # @param {type:"integer"} #10000
number_of_epochs = 3 # @param {type:"integer"} #10
batch_size = 1 # @param {type:"integer"}
replay_buffer_max_size = 10000 # @param {type: "integer"}
learning_rate = 5e-4 # @param {type:"number"}
epsilon = 1e-5 # @param {type:"number"}
discount_factor = 0.995 # @param {type:"number"}
def create_networks(observation_spec, action_spec):
actor_net = ActorDistributionRnnNetwork(
observation_spec, action_spec,
conv_layer_params = [(16, 8, 4), (32, 4, 2)],
input_fc_layer_params = (256,),
lstm_size = (256,),
output_fc_layer_params = (128,),
activation_fn = tf.nn.elu)
value_net = ValueRnnNetwork(
observation_spec,
conv_layer_params = [(16, 8, 4), (32, 4, 2)],
input_fc_layer_params = (256,),
lstm_size = (256,),
output_fc_layer_params = (128,),
activation_fn = tf.nn.elu)
return actor_net, value_net
def compute_average_return (environment, policy, number_of_episodes=10):
total_return = 0.0
for episode in range(number_of_episodes):
time_step = environment.reset()
episode_return = 0.0
step = 0
while not (time_step.is_last() or step >= 100):
action_step = policy.action(time_step)
time_step = environment.step(action_step.action)
episode_return += time_step.reward #episode_return += time_step.reward
step += 1
total_return += episode_return
average_return = total_return/number_of_episodes
return average_return.numpy()[0]
## This is the bit most similar to the part that produces the error. It works fine and prints a different value every time I run it.
metrics_doom_env = DoomEnvironment()
metrics_env = tf_py_environment.TFPyEnvironment(metrics_doom_env)
random_metrics_policy = random_tf_policy.RandomTFPolicy(metrics_env.time_step_spec(), metrics_env.action_spec())
compute_average_return (metrics_env, random_metrics_policy, 10)
env = DoomEnvironment()
tf_env = tf_py_environment.TFPyEnvironment(env)
evaluation_tf_env = tf_py_environment.TFPyEnvironment(env)
actor_net, value_net = create_networks(tf_env.observation_spec(), tf_env.action_spec())
global_step = tf.compat.v1.train.get_or_create_global_step()
#optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate = learning_rate, epsilon = epsilon)
agent = ppo_agent.PPOAgent(
time_step_spec = tf_env.time_step_spec(),
action_spec = tf_env.action_spec(),
actor_net = actor_net,
value_net = value_net,
optimizer = optimizer,
num_epochs = number_of_epochs,
train_step_counter = global_step,
discount_factor = discount_factor,
gradient_clipping = 0.5,
entropy_regularization = 1e-2,
importance_ratio_clipping = 0.2,
use_gae = True,
use_td_lambda_return = True)
agent.initialize()
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=agent.collect_data_spec,
batch_size=tf_env.batch_size,
max_length=replay_buffer_max_size)
agent.collect_data_spec
#This is not code, it is the output of the previous code block! I thought it might be helpful.
_TupleWrapper(Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name='selected_action', minimum=array(0, dtype=int32), maximum=array(4, dtype=int32)),
'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
'observation': BoundedTensorSpec(shape=(1, 160, 260, 3), dtype=tf.float32, name='screen_observation', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
'policy_info': {'dist_params': {'logits': TensorSpec(shape=(5,), dtype=tf.float32, name='CategoricalProjectionNetwork_logits')}},
'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}))
def collect_step(environment, policy, buffer):
t_step = environment.current_time_step()
action_step = policy.action(t_step)
next_t_step = environment.step(action_step.action)
trajectory_input = trajectory.from_transition(t_step, action_step, next_t_step)
buffer.add_batch(trajectory_input)
return
def collect_data(environment, policy, buffer, steps):
for step in range(steps):
collect_step(environment, policy, buffer)
data_doom_env = DoomEnvironment()
data_env = tf_py_environment.TFPyEnvironment(data_doom_env)
random_data_policy = random_tf_policy.RandomTFPolicy(data_env.time_step_spec(), data_env.action_spec())
collect_data(data_env, random_data_policy, replay_buffer, initial_collect_steps)
I have no further code (the tutorial continues) because my application breaks at this point.
I think RandomTFPolicy returns trajectories without the 'policy_info': DictWrapper({'dist_params': DictWrapper({'logits': .})}) entry.
Maybe you should use emit_log_probability=True:
random_data_policy = random_tf_policy.RandomTFPolicy(
data_env.time_step_spec(), data_env.action_spec(), emit_log_probability=True)
Based on that, I would expect it to emit the policy info. Otherwise, you may need to dig deeper into RandomTFPolicy.
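If that alone does not make the structures match (as far as I know, emit_log_probability adds a 'log_probability' entry to the policy info rather than the 'dist_params' dict declared by the PPO agent's collect_data_spec), one more thing that might be worth trying, purely as a sketch and not something I have verified against this exact setup, is to seed the replay buffer with the agent's own collect policy, whose emitted policy_info matches agent.collect_data_spec by construction:
# Sketch: use the PPO agent's own (still untrained) collect policy for the
# initial collection, so the emitted policy_info matches agent.collect_data_spec.
collect_data(data_env, agent.collect_policy, replay_buffer, initial_collect_steps)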