How to use my own environment for DDPG instead of gym



I am building a DDPG model with Keras, following the official example from here: enter link description here

But I want to use my own environment instead of gym. Here is my environment:

class Environment1:
    def __init__(self, data, history_t=90):
        self.data = data
        self.history_t = history_t
        self.reset()

    def reset(self):
        self.t = 0
        self.done = False
        self.profits = 0
        self.positions = []
        self.position_value = 0
        self.history = [0 for _ in range(self.history_t)]
        return [self.position_value] + self.history  # obs

    def step(self, act):
        reward = 0
        # act = 0: stay, 1: buy, 2: sell
        if act == 1:
            self.positions.append(self.data.iloc[self.t, :]['close'])
        elif act == 2:  # sell
            if len(self.positions) == 0:
                reward = -1
            else:
                profits = 0
                for p in self.positions:
                    profits += (self.data.iloc[self.t, :]['close'] - p)
                reward += profits
                self.profits += profits
                self.positions = []
        # set next time
        self.t += 1
        self.position_value = 0
        for p in self.positions:
            self.position_value += (self.data.iloc[self.t, :]['close'] - p)
        self.history.pop(0)
        self.history.append(self.data.iloc[self.t, :]['close'] - self.data.iloc[(self.t - 1), :]['close'])
        # clipping reward
        if reward > 0:
            reward = 1
        elif reward < 0:
            reward = -1
        return [self.position_value] + self.history, reward, self.done  # obs, reward, done


env = Environment1(train)
print(env.reset())
for _ in range(3):
    pact = np.random.randint(3)
    print(env.step(pact))

When I use my own environment as above, I get this error:

AttributeError                            Traceback (most recent call last)
<ipython-input-1-a51b38095bf0> in <module>
179 # env = gym.make(problem)
180 
--> 181 num_states = env.observation_space.shape[0]
182 print("Size of State Space ->  {}".format(num_states))
183 num_actions = env.action_space.shape[0]
AttributeError: 'Environment1' object has no attribute 'observation_space'

Here is the whole code:

import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))
import time
import copy
import numpy as np
import pandas as pd

data = pd.read_csv(r'C:\Users\willi\Downloads\spyv.csv')
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]

date_split = 377
train = data[:date_split]
test = data[date_split:]

class Environment1:
    def __init__(self, data, history_t=90):
        self.data = data
        self.history_t = history_t
        self.reset()

    def reset(self):
        self.t = 0
        self.done = False
        self.profits = 0
        self.positions = []
        self.position_value = 0
        self.history = [0 for _ in range(self.history_t)]
        return [self.position_value] + self.history  # obs

    def step(self, act):
        reward = 0
        # act = 0: stay, 1: buy, 2: sell
        if act == 1:
            self.positions.append(self.data.iloc[self.t, :]['close'])
        elif act == 2:  # sell
            if len(self.positions) == 0:
                reward = -1
            else:
                profits = 0
                for p in self.positions:
                    profits += (self.data.iloc[self.t, :]['close'] - p)
                reward += profits
                self.profits += profits
                self.positions = []
        # set next time
        self.t += 1
        self.position_value = 0
        for p in self.positions:
            self.position_value += (self.data.iloc[self.t, :]['close'] - p)
        self.history.pop(0)
        self.history.append(self.data.iloc[self.t, :]['close'] - self.data.iloc[(self.t - 1), :]['close'])
        # clipping reward
        if reward > 0:
            reward = 1
        elif reward < 0:
            reward = -1
        return [self.position_value] + self.history, reward, self.done  # obs, reward, done


env = Environment1(train)
print(env.reset())
for _ in range(3):
    pact = np.random.randint(3)
    print(env.step(pact))
# above here is all my own code; below here is the code from the Keras example
num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.shape[0]
print("Size of Action Space ->  {}".format(num_actions))
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]
print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))

class OUActionNoise:
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        x = (
            self.x_prev
            + self.theta * (self.mean - self.x_prev) * self.dt
            + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        # Store x into x_prev
        # Makes next noise dependent on current one
        self.x_prev = x
        return x

    def reset(self):
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)


class Buffer:
    def __init__(self, buffer_capacity=100000, batch_size=64):
        # Number of "experiences" to store at max
        self.buffer_capacity = buffer_capacity
        # Num of tuples to train on.
        self.batch_size = batch_size
        # It tells us num of times record() was called.
        self.buffer_counter = 0
        # Instead of a list of tuples as the exp. replay concept goes,
        # we use different np.arrays for each tuple element
        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))

    # Takes (s,a,r,s') observation tuple as input
    def record(self, obs_tuple):
        # Set index to zero if buffer_capacity is exceeded,
        # replacing old records
        index = self.buffer_counter % self.buffer_capacity
        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.buffer_counter += 1

    # Eager execution is turned on by default in TensorFlow 2. Decorating with tf.function allows
    # TensorFlow to build a static graph out of the logic and computations in our function.
    # This provides a large speed up for blocks of code that contain many small TensorFlow operations such as this one.
    @tf.function
    def update(
        self, state_batch, action_batch, reward_batch, next_state_batch,
    ):
        # Training and updating Actor & Critic networks.
        # See Pseudo Code.
        with tf.GradientTape() as tape:
            target_actions = target_actor(next_state_batch, training=True)
            y = reward_batch + gamma * target_critic(
                [next_state_batch, target_actions], training=True
            )
            critic_value = critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad, critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions = actor_model(state_batch, training=True)
            critic_value = critic_model([state_batch, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables)
        )

    # We compute the loss and update parameters
    def learn(self):
        # Get sampling range
        record_range = min(self.buffer_counter, self.buffer_capacity)
        # Randomly sample indices
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Convert to tensors
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        self.update(state_batch, action_batch, reward_batch, next_state_batch)

# This updates target parameters slowly,
# based on rate `tau`, which is much less than one.
@tf.function
def update_target(target_weights, weights, tau):
    for (a, b) in zip(target_weights, weights):
        a.assign(b * tau + a * (1 - tau))


def get_actor():
    # Initialize weights between -3e-3 and 3e-3
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(256, activation="relu")(inputs)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(1, activation="tanh", kernel_initializer=last_init)(out)

    # Our upper bound is 2.0 for Pendulum.
    outputs = outputs * upper_bound
    model = tf.keras.Model(inputs, outputs)
    return model


def get_critic():
    # State as input
    state_input = layers.Input(shape=(num_states,))
    state_out = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="relu")(state_out)

    # Action as input
    action_input = layers.Input(shape=(num_actions,))
    action_out = layers.Dense(32, activation="relu")(action_input)

    # Both are passed through separate layers before concatenating
    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(256, activation="relu")(concat)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(1)(out)

    # Outputs a single value for the given state-action pair
    model = tf.keras.Model([state_input, action_input], outputs)
    return model


def policy(state, noise_object):
    sampled_actions = tf.squeeze(actor_model(state))
    noise = noise_object()
    # Adding noise to action
    sampled_actions = sampled_actions.numpy() + noise

    # We make sure action is within bounds
    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)
    return [np.squeeze(legal_action)]

std_dev = 0.2
ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))
actor_model = get_actor()
critic_model = get_critic()
target_actor = get_actor()
target_critic = get_critic()
# Making the weights equal initially
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())
# Learning rate for actor-critic models
critic_lr = 0.002
actor_lr = 0.001
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)
total_episodes = 100
# Discount factor for future rewards
gamma = 0.99
# Used to update target networks
tau = 0.005
buffer = Buffer(50000, 64)

# To store reward history of each episode
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []
# Takes about 4 min to train
for ep in range(total_episodes):
    prev_state = env.reset()
    episodic_reward = 0

    while True:
        # Uncomment this to see the Actor in action
        # But not in a python notebook.
        # env.render()

        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
        action = policy(tf_prev_state, ou_noise)
        # Receive state and reward from environment.
        state, reward, done, info = env.step(action)

        buffer.record((prev_state, action, reward, state))
        episodic_reward += reward

        buffer.learn()
        update_target(target_actor.variables, actor_model.variables, tau)
        update_target(target_critic.variables, critic_model.variables, tau)

        # End this episode when `done` is True
        if done:
            break

        prev_state = state

    ep_reward_list.append(episodic_reward)

    # Mean of last 40 episodes
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Avg Reward is ==> {}".format(ep, avg_reward))
    avg_reward_list.append(avg_reward)
# Plotting graph
# Episodes versus Avg. Rewards
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()

# Save the weights
actor_model.save_weights("pendulum_actor.h5")
critic_model.save_weights("pendulum_critic.h5")
target_actor.save_weights("pendulum_target_actor.h5")
target_critic.save_weights("pendulum_target_critic.h5")

After I run the whole code, I get this error:

AttributeError: 'Environment1' object has no attribute 'observation_space'

Can anyone help? This is really hard for me.

Your Environment1 class does not have an observation_space attribute. To fix this, you can define one using OpenAI Gym (see the gym.spaces documentation). If you don't want to define it, you can instead change the following lines in the DDPG code:

num_states = my_num_states # instead of env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = my_num_actions # instead of env.action_space.shape[0]
print("Size of Action Space ->  {}".format(num_actions))
upper_bound = my_actions_max # instead of env.action_space.high[0]
lower_bound = my_actions_min # instead of env.action_space.low[0]

Here, my_num_states is the dimension of your state vector and my_num_actions is the dimension of your action vector; my_actions_max is the largest value in your action space and my_actions_min is the smallest.
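As a rough sketch of the first option (this assumes the gym package is installed; the Box bounds below are illustrative choices, not something dictated by your data), you can attach the two missing attributes to an Environment1 instance with gym.spaces.Box:

import numpy as np
from gym import spaces

env = Environment1(train)

# Observation is [position_value] + history, i.e. history_t + 1 floats.
env.observation_space = spaces.Box(
    low=-np.inf, high=np.inf, shape=(env.history_t + 1,), dtype=np.float32
)
# DDPG produces continuous actions; a single float in [0, 2] is assumed here,
# which step() would still have to round/map to the discrete actions
# 0 (stay), 1 (buy), 2 (sell).
env.action_space = spaces.Box(low=0.0, high=2.0, shape=(1,), dtype=np.float32)

num_states = env.observation_space.shape[0]   # 91 with the default history_t=90
num_actions = env.action_space.shape[0]       # 1
upper_bound = env.action_space.high[0]        # 2.0
lower_bound = env.action_space.low[0]         # 0.0

If you go with the second option instead, the same reading of your environment gives my_num_states = history_t + 1 (91 by default), my_num_actions = 1, my_actions_max = 2 and my_actions_min = 0; either way, the actor's continuous output still has to be mapped onto the three discrete actions before step() can use it.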
