I created a custom environment for a trading bot (RL) example.
During training I want to inspect the results with TensorBoard, but all I see are a few metrics, specifically:
-----------------------------------------
| time/ | |
| fps | 711 |
| iterations | 2 |
| time_elapsed | 5 |
| total_timesteps | 4096 |
| train/ | |
| approx_kl | 0.011529377 |
| clip_fraction | 0.0534 |
| clip_range | 0.2 |
| entropy_loss | -1.09 |
| explained_variance | 0.0319 |
| learning_rate | 0.0003 |
| loss | 0.0119 |
| n_updates | 10 |
| policy_gradient_loss | -0.00402 |
| value_loss | 0.0277 |
-----------------------------------------
According to this article (https://medium.com/aureliantactics/understanding-ppo-plots-in-tensorboard-cbc3199b9ba2) I expected more metrics, in particular reward-related ones such as rollout/ep_len_mean and rollout/ep_rew_mean.
Here is my code:
import random
from collections import deque

import gym
import numpy as np
from gym import spaces


class customEnv(gym.Env):
    """Custom Environment that follows gym interface"""
    metadata = {'render.modes': ['human']}

    def __init__(self, df, initial_balance=100, lookback_window_size=50, Render_range=100):
        super(customEnv, self).__init__()
        self.df = df.reset_index()
        self.fees = .998  # fraction of each trade kept after fees (i.e. a 0.2% fee)
        self.initial_balance = initial_balance
        self.lookback_window_size = lookback_window_size
        self.df_total_steps = len(self.df) - 1
        self.orders_history = deque(maxlen=self.lookback_window_size)
        self.columns = list(self.df.columns[1:])
        self.Render_range = Render_range
        # Market history contains the OHLC values for the last lookback_window_size prices
        self.market_history = deque(maxlen=self.lookback_window_size)
        # Define action and observation space
        # They must be gym.spaces objects
        # Discrete actions: 0 = hold, 1 = buy, 2 = sell
        self.action_space = spaces.Discrete(3)
        # Observation: lookback window of the market columns plus 5 account values
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(self.lookback_window_size, len(self.columns) + 5),
                                            dtype=np.float64)
    def reset(self, env_steps_size=0):
        self.balance = self.initial_balance
        self.net_worth = self.initial_balance
        self.prev_net_worth = self.initial_balance
        self.crypto_held = 0
        self.crypto_sold = 0
        self.crypto_bought = 0
        self.episode_orders = 0       # track episode orders count
        self.prev_episode_orders = 0  # track previous episode orders count
        self.rewards = deque(maxlen=self.Render_range)
        self.env_steps_size = env_steps_size
        self.punish_value = 0
        self.trades = deque(maxlen=self.Render_range)
        if env_steps_size > 0:  # used for training dataset
            self.start_step = random.randint(self.lookback_window_size, self.df_total_steps - env_steps_size)
            self.end_step = self.start_step + env_steps_size
        else:  # used for testing dataset
            self.start_step = self.lookback_window_size
            self.end_step = self.df_total_steps
        self.current_step = self.start_step
        for i in reversed(range(self.lookback_window_size)):
            current_step = self.current_step - i
            self.orders_history.append([self.balance,
                                        self.net_worth,
                                        self.crypto_bought,
                                        self.crypto_sold,
                                        self.crypto_held])
            # fill market history within the reset call
            self.market_history.append([self.df.loc[current_step, column] for column in self.columns])
        state = np.concatenate((self.orders_history, self.market_history), axis=1)
        return state
    def step(self, action, production=False):
        self.crypto_bought = 0
        self.crypto_sold = 0
        self.current_step += 1
        # Use the Open price of the current step as the execution price
        current_price = self.df.loc[self.current_step, 'Open']
        Date = self.df.loc[self.current_step, 'Date']  # for visualization
        High = self.df.loc[self.current_step, 'High']  # for visualization
        Low = self.df.loc[self.current_step, 'Low']    # for visualization
        if action == 0:  # Hold
            pass
        elif action == 1 and self.balance > self.initial_balance * 0.05:
            # Buy with 100% of current balance
            self.crypto_bought = self.balance / current_price
            self.crypto_bought *= self.fees  # subtract fees (self.fees is the fraction kept)
            self.balance -= self.crypto_bought * current_price
            self.crypto_held += self.crypto_bought
            self.trades.append({'Date': Date, 'High': High, 'Low': Low, 'total': self.crypto_bought, 'type': "buy", 'current_price': current_price})
            self.episode_orders += 1
        elif action == 2 and self.crypto_held * current_price > self.initial_balance * 0.05:
            # Sell 100% of current crypto held
            self.crypto_sold = self.crypto_held
            self.crypto_sold *= self.fees  # subtract fees (self.fees is the fraction kept)
            self.balance += self.crypto_sold * current_price
            self.crypto_held -= self.crypto_sold
            self.trades.append({'Date': Date, 'High': High, 'Low': Low, 'total': self.crypto_sold, 'type': "sell", 'current_price': current_price})
            self.episode_orders += 1
        self.prev_net_worth = self.net_worth
        self.net_worth = self.balance + self.crypto_held * current_price
        self.orders_history.append([self.balance,
                                    self.net_worth,
                                    self.crypto_bought,
                                    self.crypto_sold,
                                    self.crypto_held])
        # Receive calculated reward
        reward = self.get_reward()
        # End the episode when net worth drops below 90% of the initial balance
        # or when the data window is exhausted
        done = self.net_worth <= self.initial_balance * 0.9 or self.current_step >= self.end_step
        obs = self.next_observation(production)
        return obs, reward, done, {}
    # Get the data points for the given current_step
    def next_observation(self, production):
        if not production:
            self.market_history.append([self.df.loc[self.current_step, column] for column in self.columns])
        obs = np.concatenate((self.orders_history, self.market_history), axis=1)
        return obs

    # Calculate reward
    def get_reward(self):
        if self.episode_orders > 1 and self.episode_orders > self.prev_episode_orders:
            self.prev_episode_orders = self.episode_orders
            if self.trades[-1]['type'] == "buy" and self.trades[-2]['type'] == "sell":
                reward = self.trades[-2]['total'] * self.trades[-2]['current_price'] - self.trades[-2]['total'] * self.trades[-1]['current_price']
                self.trades[-1]["Reward"] = reward
                return reward
            elif self.trades[-1]['type'] == "sell" and self.trades[-2]['type'] == "buy":
                reward = self.trades[-1]['total'] * self.trades[-1]['current_price'] - self.trades[-2]['total'] * self.trades[-2]['current_price']
                self.trades[-1]["Reward"] = reward
                return reward
            else:
                return 0
        else:
            return 0
    # Render the environment to the screen
    def render(self, mode='human', close=False):
        profit = self.net_worth - self.initial_balance
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Crypto held: {self.crypto_held}')
        print(f'Profit: {profit}')
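As a sanity check I can also validate the environment against the gym API with Stable-Baselines3's built-in checker; a minimal optional sketch, assuming df is the same OHLC DataFrame I load elsewhere:

from stable_baselines3.common.env_checker import check_env

# Optional: prints warnings if the custom env deviates from the gym interface
check_env(customEnv(df), warn=True)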
from stable_baselines3 import A2C, PPO

# env is the customEnv instance; df, logdir, models_dir and kind are defined elsewhere (not shown)
env.reset()
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=logdir)
TIMESTEPS = 10000
for i in range(1, 10):
    model.learn(total_timesteps=TIMESTEPS * i, reset_num_timesteps=False, tb_log_name=kind)
    model.save(f"{models_dir}/{TIMESTEPS*i}")
How can I display more metrics?
Thanks a lot.
I found the solution: the env variable has to be wrapped in a Monitor:
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

env = Monitor(env, logdir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])
I hope this helps.
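For completeness, a minimal sketch of how the wrapped environment fits into the training code from the question (the logdir value below is just a placeholder, and df is assumed to be the OHLC DataFrame):

from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

logdir = "./tensorboard_logs"         # placeholder log directory
env = Monitor(customEnv(df), logdir)  # Monitor records per-episode reward and length
env = DummyVecEnv([lambda: env])      # SB3 expects a (vectorized) env
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=logdir)
model.learn(total_timesteps=10000)

With the Monitor wrapper in place, rollout/ep_rew_mean and rollout/ep_len_mean should appear in TensorBoard (started with tensorboard --logdir pointing at the log directory) once at least one episode has finished.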