Why does ep_rew_mean decrease over time?



To learn about reinforcement learning for optimization, I wrote some code that tries to find a maximum cardinality matching in a graph. Not only does it not work well: when I set it up so that done = True after 1000 steps, ep_rew_mean actually decreases over time, and it just keeps getting worse. The code below should be directly copy-and-pasteable:

import igraph as ig
from tqdm import tqdm
import numpy as np
import gym
from gym import spaces
from math import exp
import random
from sb3_contrib import TRPO
from stable_baselines3 import PPO, DQN,  A2C
from stable_baselines3.common.env_util import make_vec_env
class MaxMatchEnv(gym.Env):
    metadata = {'render.modes': ['console']}

    def __init__(self, array_length=10):
        super(MaxMatchEnv, self).__init__()
        # Size of the 1D-grid
        self.array_length = array_length
        self.agent_pos = [0] * array_length
        self.action_space = spaces.Discrete(array_length)
        self.observation_space = spaces.Box(low=0, high=1, shape=(array_length,), dtype=np.uint8)
        self.matching = set()        # set of edges
        self.matching_nodes = set()  # set of node ids (ints)
        self.matching_size = len([v for v in g_matching.matching if v < num_variables and v != -1])
        self.best_found = 0

    def reset(self):
        # Start each episode from an empty matching
        self.time = 0
        #print(self.agent_pos)
        self.agent_pos = [0] * self.array_length
        self.matching = set()
        self.matching_nodes = set()
        return np.array(self.agent_pos)

    def step(self, action):
        self.time += 1
        #print("Current state:", self.agent_pos)
        #print("Edges included", self.matching_nodes)
        # There should be twice as many nodes in the matching as edges
        assert 2 * sum(self.agent_pos) == len(self.matching_nodes)
        reward = 0
        edge = g.es[action]
        if not (edge.source in self.matching_nodes or edge.target in self.matching_nodes):
            #print("Adding edge", action)
            self.matching.add(edge)
            self.matching_nodes.add(edge.source)
            self.matching_nodes.add(edge.target)
            self.agent_pos[action] = 1
            if sum(self.agent_pos) > self.best_found:
                self.best_found = sum(self.agent_pos)
                print(max(env.get_attr("best_found")), end=" ")
            reward = 1
        elif self.agent_pos[action] == 1:
            #print("Removing edge", action)
            self.matching_nodes.remove(edge.source)
            self.matching_nodes.remove(edge.target)
            self.matching.remove(edge)
            self.agent_pos[action] = 0
            reward = -1
        if self.time == 1000:
            done = True
        else:
            done = False
        # done = sum(self.agent_pos) == self.matching_size
        info = {}
        return np.array(self.agent_pos), reward, done, info

    def render(self, mode='console'):
        print(sum(self.agent_pos))

    def close(self):
        pass


if __name__ == '__main__':
    random.seed(7)
    num_variables = 1000
    g = ig.Graph.Random_Bipartite(num_variables, num_variables, p=3/num_variables)
    g_matching = g.maximum_bipartite_matching()
    print("Matching size", len([v for v in g_matching.matching if v < num_variables and v != -1]))
    env = make_vec_env(lambda: MaxMatchEnv(array_length=len(g.es)), n_envs=12)
    model = PPO('MlpPolicy', env, verbose=1).learn(10000000)

Output:

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 399      |
| time/              |          |
|    fps             | 3071     |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 24576    |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | 347        |
| time/                   |            |
|    fps                  | 927        |
|    iterations           | 2          |
|    time_elapsed         | 52         |
|    total_timesteps      | 49152      |
| train/                  |            |
|    approx_kl            | 0.13847528 |
|    clip_fraction        | 0.646      |
|    clip_range           | 0.2        |
|    entropy_loss         | -7.19      |
|    explained_variance   | 0.00317    |
|    learning_rate        | 0.0003     |
|    loss                 | 0.307      |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.125     |
|    value_loss           | 2.81       |
----------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1e+03     |
|    ep_rew_mean          | 306       |
| time/                   |           |
|    fps                  | 740       |
|    iterations           | 3         |
|    time_elapsed         | 99        |
|    total_timesteps      | 73728     |
| train/                  |           |
|    approx_kl            | 0.1270247 |
|    clip_fraction        | 0.765     |
|    clip_range           | 0.2       |
|    entropy_loss         | -4.77     |
|    explained_variance   | 0.604     |
|    learning_rate        | 0.0003    |
|    loss                 | 0.0528    |
|    n_updates            | 20        |
|    policy_gradient_loss | -0.124    |
|    value_loss           | 1.16      |
---------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | 279        |
| time/                   |            |
|    fps                  | 671        |
|    iterations           | 4          |
|    time_elapsed         | 146        |
|    total_timesteps      | 98304      |
| train/                  |            |
|    approx_kl            | 0.09246389 |
|    clip_fraction        | 0.69       |
|    clip_range           | 0.2        |
|    entropy_loss         | -4.13      |
|    explained_variance   | 0.875      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0912     |
|    n_updates            | 30         |
|    policy_gradient_loss | -0.104     |
|    value_loss           | 0.757      |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1e+03      |
|    ep_rew_mean          | 227        |
| time/                   |            |
|    fps                  | 631        |
|    iterations           | 5          |
|    time_elapsed         | 194        |
|    total_timesteps      | 122880     |
| train/                  |            |
|    approx_kl            | 0.07252013 |
|    clip_fraction        | 0.596      |
|    clip_range           | 0.2        |
|    entropy_loss         | -3.67      |
|    explained_variance   | 0.944      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0491     |
|    n_updates            | 40         |
|    policy_gradient_loss | -0.0927    |
|    value_loss           | 0.61       |
----------------------------------------

Why is this happening?

You could try adding:

if self.time == 1000:
    done = True
    self.reset()

At the moment MaxMatchEnv is not being reset, which may mean that rewards become harder to obtain in a new simulation, because they are not comparable when the initial states differ.

If that does indeed fix the issue, you could also try increasing the time limit, to see whether giving the model more time in subsequent episodes helps; a sketch of that change follows.
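
As a rough illustration of that idea, the hard-coded 1000 could be replaced with an attribute (max_steps here is a hypothetical name, e.g. set via self.max_steps = 5000 in __init__, not something from your code), so the episode length is easy to vary:

# Sketch only: max_steps is a hypothetical attribute (e.g. self.max_steps = 5000
# in __init__), replacing the hard-coded 1000-step episode length.
if self.time >= self.max_steps:
    done = True
    self.reset()
else:
    done = False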

Edit: you also need to add self.best_found = 0 to your environment's reset method.
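
For that edit, a minimal sketch of the updated reset method (the rest of the class unchanged) could look like:

def reset(self):
    # Start each episode from an empty matching
    self.time = 0
    self.agent_pos = [0] * self.array_length
    self.matching = set()
    self.matching_nodes = set()
    self.best_found = 0  # added: forget the best value found in earlier episodes
    return np.array(self.agent_pos)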
