Python稳定基线3-断言错误:"reset()"方法返回的观测值必须是int

我正在尝试学习强化学习，以在python中的自定义游戏中训练ai，并决定使用gym作为环境，使用稳定基线3进行训练。我决定从一个基本的井字游戏环境开始。这是我的代码

import gym
from gym import spaces
import numpy as np
from stable_baselines3.common.env_checker import check_env
class tictactoe(gym.Env):
def __init__(self):
#creating grid, action and obervation space
self.box = [0,0,0,0,0,0,0,0,0]
self.done=False
self.turn = 1
self.action_space = spaces.Discrete(9)
self.observation_space = spaces.Discrete(9)
def _get_obs(self):
#returns the observation (the grid)
return np.array(self.box)
def iswinner(self, b, l):
#function to check if a side has won
return (b[1] == l and b[2] == l and b[3] == l) or (b[4] == l and b[5] == l and b[6] == l) or (b[7] == l and b[8] == l and b[9] == l) or (b[1] == l and b[4] == l and b[7] == l) or (b[7] == l and b[5] == l and b[3] == l) or (b[1] == l and b[5] == l and b[9] == l) or (b[8] == l and b[5] == l and b[2] == l) or (b[9] == l and b[6] == l and b[3] == l)

def reset(self):
#resets the env (grid, turn and done variable) and returns the observation
self.box = [0,0,0,0,0,0,0,0,0]
self.turn = 1
self.done=False
return self._get_obs()
def step(self, action):
#gives negative reward for illegal move (square occupied)
if self.box[action] != 0:
return self._get_obs(), -10, True, {} 
#enters a value (1 or 2) in the grid and flips the turn
self.box[action] = self.turn
self.turn = (1 if self.turn == 2 else 2)
reward = 0
#checks if the game is over and sets a reward (+5 win, 0 draw)
if self.iswinner([0]+self.box,1) and self.turn == 1: reward,self.done = 5,True
elif 0 not in self.box: reward,self.done = 0,True
#returns the observation (grid), reward, if the game is finished and extra information (empty dict for me)
return self._get_obs(), reward, self.done, {}
def render(self):
#renders the board so it looks like a grid
print(self.box[:3],self.box[3:6],self.box[6:],sep='n')
#checking the env
env = tictactoe()
print(check_env(env))

尝试这个代码，我得到了错误AssertionError: The observation returned by 'reset()' method must be an int。我完全不明白这是怎么回事。由于我的reset函数返回来自_get_obs的观测。它是想说我的观测值一定是一个整数吗？这就更没有意义了，因为现在我不知道该怎么做。

进行时

self.observation_space = spaces.Discrete(9)

实际上，您将观察空间定义为一个可以接受{0, 1, 2, 3, 4, 5, 6, 7, 8}所有值的单个值，因为您将其定义为离散的一维空间(也称为整数(。

正如你所说的，你试图创造一个井字游戏环境，我想你实际上想做的是类似的事情

self.observation_space = spaces.MultiDiscrete([3, 3, 3, 3, 3, 3, 3, 3, 3])
# or self.observation_space = spaces.MultiDiscrete(9 * [3]), which would be cleaner

这意味着你总共有9个瓦片，每个瓦片可以处于三种不同的状态(空、X或O(。

相关内容

最新更新

热门标签：