More metrics in TensorBoard



I created a custom environment for a trading bot (RL) example.

During training I want to check the results with TensorBoard, but all I see are a few metrics, specifically:

-----------------------------------------
| time/                   |             |
|    fps                  | 711         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011529377 |
|    clip_fraction        | 0.0534      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.0319      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0119      |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00402    |
|    value_loss           | 0.0277      |
-----------------------------------------

According to this article (https://medium.com/aureliantactics/understanding-ppo-plots-in-tensorboard-cbc3199b9ba2) I expected more metrics, in particular reward-related ones such as rollout/ep_len_mean and rollout/ep_rew_mean.

Here is my code:


import gym
from gym import spaces
from collections import deque  # needed for orders_history / market_history
import numpy as np             # needed for the observation space and state arrays
import random                  # needed for the random start step during training


class customEnv(gym.Env):
    """Custom Environment that follows gym interface"""
    metadata = {'render.modes': ['human']}

    def __init__(self, df, initial_balance=100, lookback_window_size=50, Render_range=100):
        super(customEnv, self).__init__()
        self.df = df.reset_index()
        self.fees = .998
        self.initial_balance = initial_balance
        self.lookback_window_size = lookback_window_size
        self.df_total_steps = len(self.df) - 1
        self.orders_history = deque(maxlen=self.lookback_window_size)
        self.columns = list(self.df.columns[1:])
        self.Render_range = Render_range
        # Market history contains the OHCL values for the last lookback_window_size prices
        self.market_history = deque(maxlen=self.lookback_window_size)
        # Define action and observation space
        # They must be gym.spaces objects
        # Example when using discrete actions:
        self.action_space = spaces.Discrete(3)
        # Example for using image as input:
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(self.lookback_window_size, len(self.columns) + 5),
                                            dtype=np.float64)

    def reset(self, env_steps_size=0):
        #self.visualization = TradingGraph(Render_range=self.Render_range, Show_reward=self.Show_reward, Show_indicators=self.Show_indicators) # init visualization
        #self.trades = deque(maxlen=self.Render_range) # limited orders memory for visualization
        #print("RESET")
        self.balance = self.initial_balance
        self.net_worth = self.initial_balance
        self.prev_net_worth = self.initial_balance
        self.crypto_held = 0
        self.crypto_sold = 0
        self.crypto_bought = 0
        self.episode_orders = 0 # track episode orders count
        self.prev_episode_orders = 0 # track previous episode orders count
        self.rewards = deque(maxlen=self.Render_range)
        self.env_steps_size = env_steps_size
        self.punish_value = 0
        self.trades = deque(maxlen=self.Render_range)
        if env_steps_size > 0: # used for training dataset
            self.start_step = random.randint(self.lookback_window_size, self.df_total_steps - env_steps_size)
            self.end_step = self.start_step + env_steps_size
        else: # used for testing dataset
            self.start_step = self.lookback_window_size
            self.end_step = self.df_total_steps

        self.current_step = self.start_step

        for i in reversed(range(self.lookback_window_size)):
            current_step = self.current_step - i
            self.orders_history.append([self.balance, #/ self.normalize_value,
                                        self.net_worth,# / self.normalize_value,
                                        self.crypto_bought,# / self.normalize_value,
                                        self.crypto_sold,# / self.normalize_value,
                                        self.crypto_held# / self.normalize_value
                                        ])

            # one line for loop to fill market history within reset call
            self.market_history.append([self.df.loc[current_step, column] for column in self.columns])

        state = np.concatenate((self.orders_history, self.market_history), axis=1)
        #print(f"END RESET: {state.shape} - {np.isnan(state).sum()}")
        return state

    def step(self, action, production=False):
        #print("STEP")
        self.crypto_bought = 0
        self.crypto_sold = 0
        self.current_step += 1

        # Set the current price to a random price between open and close
        #current_price = random.uniform(
        #    self.df.loc[self.current_step, 'Open'],
        #    self.df.loc[self.current_step,'Close'])
        current_price = self.df.loc[self.current_step, 'Open']
        Date = self.df.loc[self.current_step, 'Date'] # for visualization
        High = self.df.loc[self.current_step, 'High'] # for visualization
        Low = self.df.loc[self.current_step, 'Low'] # for visualization

        if action == 0: # Hold
            pass

        elif action == 1 and self.balance > self.initial_balance*0.05:
            # Buy with 100% of current balance
            self.crypto_bought = self.balance / current_price
            self.crypto_bought *= (1-self.fees) # subtract fees
            self.balance -= self.crypto_bought * current_price
            self.crypto_held += self.crypto_bought
            self.trades.append({'Date': Date, 'High': High, 'Low': Low, 'total': self.crypto_bought, 'type': "buy", 'current_price': current_price})
            self.episode_orders += 1

        elif action == 2 and self.crypto_held*current_price > self.initial_balance*0.05:
            # Sell 100% of current crypto held
            self.crypto_sold = self.crypto_held
            self.crypto_sold *= (1-self.fees) # subtract fees
            self.balance += self.crypto_sold * current_price
            self.crypto_held -= self.crypto_sold
            self.trades.append({'Date': Date, 'High': High, 'Low': Low, 'total': self.crypto_sold, 'type': "sell", 'current_price': current_price})
            self.episode_orders += 1

        self.prev_net_worth = self.net_worth
        self.net_worth = self.balance + self.crypto_held * current_price

        self.orders_history.append([self.balance, #/ self.normalize_value,
                                    self.net_worth,# / self.normalize_value,
                                    self.crypto_bought,# / self.normalize_value,
                                    self.crypto_sold,# / self.normalize_value,
                                    self.crypto_held# / self.normalize_value
                                    ])

        # Receive calculated reward
        reward = self.get_reward()

        if self.net_worth <= self.initial_balance*0.9:
            done = True
        else:
            done = False

        obs = self.next_observation(production)
        #print(reward,action)
        return obs, reward, done, {}

    # Get the data points for the given current_step
    def next_observation(self, production):
        if not production:
            self.market_history.append([self.df.loc[self.current_step, column] for column in self.columns])
        obs = np.concatenate((self.orders_history, self.market_history), axis=1)
        return obs

    # Calculate reward
    def get_reward(self):
        if self.episode_orders > 1 and self.episode_orders > self.prev_episode_orders:
            self.prev_episode_orders = self.episode_orders
            if self.trades[-1]['type'] == "buy" and self.trades[-2]['type'] == "sell":
                reward = self.trades[-2]['total']*self.trades[-2]['current_price'] - self.trades[-2]['total']*self.trades[-1]['current_price']
                self.trades[-1]["Reward"] = reward
                return reward
            elif self.trades[-1]['type'] == "sell" and self.trades[-2]['type'] == "buy":
                reward = self.trades[-1]['total']*self.trades[-1]['current_price'] - self.trades[-2]['total']*self.trades[-2]['current_price']
                self.trades[-1]["Reward"] = reward
                return reward
            #elif self.trades[-1]['type'] == "sell" and self.trades[-2]['type'] == "sell":
            #    return -100
            #elif self.trades[-1]['type'] == "buy" and self.trades[-2]['type'] == "buy":
            #    return -100
            else:
                return 0
        else:
            return 0

    def render(self, mode='human', close=False):
        # Render the environment to the screen
        profit = self.net_worth - self.initial_balance
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Crypto held: {self.crypto_held}')
        print(f'Profit: {profit}')

from stable_baselines3 import A2C, PPO

# env, logdir, kind and models_dir are defined earlier in the script
env.reset()
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=logdir)
TIMESTEPS = 10000
for i in range(1, 10):
    model.learn(total_timesteps=TIMESTEPS*i, reset_num_timesteps=False, tb_log_name=kind)
    model.save(f"{models_dir}/{TIMESTEPS*i}")

How can I display more metrics?

Many thanks.

I found the solution: the env variable has to be wrapped in a Monitor:

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv  # import needed for the wrapper below

env = Monitor(env, logdir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])
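
Putting it together with the training code from the question, a minimal sketch (customEnv, df and logdir are assumed to be the ones defined above; the log name "PPO" is just a placeholder):

from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

env = customEnv(df)                                  # custom trading environment from the question
env = Monitor(env, logdir, allow_early_resets=True)  # records episode reward and length
env = DummyVecEnv([lambda: env])                     # SB3 trains on a (vectorized) environment

model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=logdir)
model.learn(total_timesteps=10000, tb_log_name="PPO")

With the Monitor wrapper in place, the rollout/ section with ep_rew_mean and ep_len_mean should appear in TensorBoard next to the time/ and train/ metrics.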

I hope this helps.
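
If you also want environment-specific values (e.g. net_worth) in TensorBoard, they can be logged from a custom Stable-Baselines3 callback. A minimal sketch, assuming the Monitor/DummyVecEnv setup above; the class name NetWorthCallback and the tag "custom/net_worth" are illustrative, not part of the original post:

from stable_baselines3.common.callbacks import BaseCallback

class NetWorthCallback(BaseCallback):
    """Log the environment's current net_worth to TensorBoard."""
    def _on_step(self) -> bool:
        # read the attribute from the first (and only) sub-environment of the VecEnv
        net_worth = self.training_env.get_attr("net_worth")[0]
        self.logger.record("custom/net_worth", net_worth)
        return True

model.learn(total_timesteps=10000, callback=NetWorthCallback(), tb_log_name="PPO")

Recorded values are written out whenever SB3 dumps its logs, so they show up under custom/ alongside the other sections.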
