I am trying to build a reinforcement learning algorithm that can play the game MasterMind, using MultiDiscrete action and observation spaces. The action space has 4 slots with 6 colors each, and the observation space is 2x4. I created a custom environment to connect to my own implementation of the game. The environment is not working yet because of an error. Maybe someone can help me with this.
import gym
import numpy as np
from gym import Env
from gym.spaces import Discrete, Box, MultiDiscrete, Dict
from stable_baselines3 import A2C
from stable_baselines3.common.policies import MultiInputActorCriticPolicy

action_space = MultiDiscrete(np.array([6, 6, 6, 6]), dtype=int)
observation_space = MultiDiscrete(np.array([4, 4]), dtype=int)
...
class MasterMindEnv(Env):
    def __init__(self) -> None:
        super(MasterMindEnv, self).__init__()
        self.action_space = action_space
        self.observation_space = observation_space

    def step(self, action: np.ndarray):
        # Forward the guess to my game implementation and read back its feedback.
        pass_action(action)
        output = get_output()
        print(output)
        reward = output[0] + output[1]
        print(reward)
        done = False
        info = {}
        # Placeholder for now: a random observation and a constant reward.
        return observation_space.sample(), 1, done, info

    def reset(self):
        return self.observation_space.sample()
...
model = A2C(MultiInputActorCriticPolicy, env)
model.learn(total_timesteps=1000)
The error is:
AttributeError Traceback (most recent call last)
c:\...\model.ipynb cell 10 in <module>
----> 1 model = A2C(MultiInputActorCriticPolicy, env)
2 model.learn(total_timesteps=1000)
File c:\...\Python310\lib\site-packages\stable_baselines3\a2c\a2c.py:126, in A2C.__init__(self, policy, env, learning_rate, n_steps, gamma, gae_lambda, ent_coef, vf_coef, max_grad_norm, rms_prop_eps, use_rms_prop, use_sde, sde_sample_freq, normalize_advantage, tensorboard_log, create_eval_env, policy_kwargs, verbose, seed, device, _init_setup_model)
123 self.policy_kwargs["optimizer_kwargs"] = dict(alpha=0.99, eps=rms_prop_eps, weight_decay=0)
125 if _init_setup_model:
--> 126 self._setup_model()
File c:\...\Python310\lib\site-packages\stable_baselines3\common\on_policy_algorithm.py:123, in OnPolicyAlgorithm._setup_model(self)
112 buffer_cls = DictRolloutBuffer if isinstance(self.observation_space, gym.spaces.Dict) else RolloutBuffer
114 self.rollout_buffer = buffer_cls(
115 self.n_steps,
116 self.observation_space,
(...)
121 n_envs=self.n_envs,
122 )
--> 123 self.policy = self.policy_class( # pytype:disable=not-instantiable
...
--> 258 for key, subspace in observation_space.spaces.items():
259 if is_image_space(subspace):
260 extractors[key] = NatureCNN(subspace, features_dim=cnn_output_dim)
AttributeError: 'MultiDiscrete' object has no attribute 'spaces'
Update
class MasterMindEnv(Env):
    def __init__(self) -> None:
        super(MasterMindEnv, self).__init__()
        self.action_space = MultiDiscrete(np.array([6, 6, 6, 6]), dtype=int)
        self.observation_space = MultiDiscrete(np.array([4, 4]), dtype=int)

    def step(self, action: np.ndarray):
        # Still a placeholder: sample a random observation instead of real feedback.
        output = self.observation_space.sample()
        reward = output[0] + output[1]
        done = False
        if reward == 8:
            done = True
        info = {}
        return output, float(reward), done, info

    def reset(self):
        return self.observation_space.sample()
env = MasterMindEnv()
model = A2C("MlpPolicy", env)
model.learn(total_timesteps=1000)
It produces:
RuntimeError Traceback (most recent call last)
c:\...\model.ipynb cell 9 in <module>
1 model = A2C("MlpPolicy", env)
----> 2 model.learn(total_timesteps=1000)
File c:\...\Python310\lib\site-packages\stable_baselines3\a2c\a2c.py:203, in A2C.learn(self, total_timesteps, callback, log_interval, eval_env, eval_freq, n_eval_episodes, tb_log_name, eval_log_path, reset_num_timesteps, progress_bar)
189 def learn(
190 self: A2CSelf,
191 total_timesteps: int,
(...)
200 progress_bar: bool = False,
201 ) -> A2CSelf:
--> 203 return super().learn(
204 total_timesteps=total_timesteps,
205 callback=callback,
206 log_interval=log_interval,
207 eval_env=eval_env,
208 eval_freq=eval_freq,
209 n_eval_episodes=n_eval_episodes,
210 tb_log_name=tb_log_name,
211 eval_log_path=eval_log_path,
212 reset_num_timesteps=reset_num_timesteps,
213 progress_bar=progress_bar,
214 )
...
--> 464 return th.as_tensor(obs).to(device)
465 elif isinstance(obs, dict):
466 return {key: th.as_tensor(_obs).to(device) for (key, _obs) in obs.items()}
RuntimeError: Could not infer dtype of numpy.int32
observation_space = MultiDiscrete(np.array([4,4]), dtype=int)
...
model = A2C(MultiInputActorCriticPolicy, env)
...
for key, subspace in observation_space.spaces.items():
A MultiDiscrete space does not need multi-input. It is still just a single observation space; multi-input is only needed when several observation spaces are provided.
Either do not use a multi-input policy (use ActorCriticPolicy instead, for example), or wrap your spaces (e.g. with spaces.Tuple).
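A minimal sketch of the first option, assuming the environment otherwise stays as in the question. It is essentially what the update already does; the only extra is casting the sampled observation to np.int64, which is just a guess at the dtype RuntimeError from the second traceback, not a confirmed fix:

import numpy as np
from gym import Env
from gym.spaces import MultiDiscrete
from stable_baselines3 import A2C

class MasterMindEnv(Env):
    def __init__(self) -> None:
        super().__init__()
        self.action_space = MultiDiscrete([6, 6, 6, 6])
        self.observation_space = MultiDiscrete([4, 4])

    def step(self, action: np.ndarray):
        # Placeholder feedback until the real game is wired in.
        obs = self.observation_space.sample().astype(np.int64)
        reward = float(obs[0] + obs[1])
        return obs, reward, False, {}

    def reset(self):
        return self.observation_space.sample().astype(np.int64)

env = MasterMindEnv()
# A MultiDiscrete observation is a single space, so a standard policy is enough.
# For A2C, the "MlpPolicy" string resolves to ActorCriticPolicy.
model = A2C("MlpPolicy", env)
model.learn(total_timesteps=1000)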
Stable Baselines3 supports handling of multiple inputs by using Dict Gym space. This can be done using MultiInputPolicy, which by default uses the CombinedExtractor feature extractor to turn multiple inputs into a single vector, handled by the net_arch network.
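For completeness, a hedged sketch of that Dict-space route. The keys "feedback" and "last_guess" and the way the 2x4 observation is split across them are made up here for illustration; the point is only the Dict wrapper plus "MultiInputPolicy" (which maps to MultiInputActorCriticPolicy and uses the CombinedExtractor):

import numpy as np
from gym import Env
from gym.spaces import Dict, MultiDiscrete
from stable_baselines3 import A2C

class MasterMindDictEnv(Env):
    def __init__(self) -> None:
        super().__init__()
        self.action_space = MultiDiscrete([6, 6, 6, 6])
        # Hypothetical split of the observation into two named inputs.
        self.observation_space = Dict({
            "feedback": MultiDiscrete([4, 4]),
            "last_guess": MultiDiscrete([6, 6, 6, 6]),
        })

    def step(self, action: np.ndarray):
        # Placeholder feedback, as in the question.
        obs = self.observation_space.sample()
        reward = float(obs["feedback"][0] + obs["feedback"][1])
        return obs, reward, False, {}

    def reset(self):
        return self.observation_space.sample()

env = MasterMindDictEnv()
model = A2C("MultiInputPolicy", env)
model.learn(total_timesteps=1000)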