py_environment 'time_step'与"time_step_spec"不匹配

我已经通过TF代理创建了一个自定义的Pyenvironment。但是我无法验证环境或使用py_policy.action采取措施我对time_step_specs

的除外的内容感到困惑

我尝试通过tf_py_environment.tfpyenvironment转换为tf_py_environment，并成功地使用TF_Policy采取了行动，但我仍然对区别感到困惑。

import abc
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.trajectories import time_step as ts
from tf_agents.policies import random_tf_policy
import tensorflow as tf
import tf_agents
class TicTacToe(py_environment.PyEnvironment):
   def __init__(self,n):
    super(TicTacToe,self).__init__()
    self.n = n
    self.winner = None
    self._episode_ended = False
    self.inital_state = np.zeros((n,n))
    self._state = self.inital_state
    self._observation_spec = array_spec.BoundedArraySpec(
        shape = (n,n),dtype='int32',minimum = -1,maximum = 1,name = 
'TicTacToe board state spec')
    self._action_spec = array_spec.BoundedArraySpec(
        shape = (),dtype = 'int32', minimum = 0,maximum = 8, name = 
'TicTacToe action spec')
def observation_spec(self):
    return self._observation_spec
def action_spec(self):
    return self._action_spec
def _reset(self):
    return ts.restart(self.inital_state)
def check_game_over(self):
    for i in range(self.n):
        if (sum(self._state[i,:])==self.n) or 
(sum(self._state[:,i])==self.n):
            self.winner = 1
            return True
        elif (sum(self._state[i,:])==-self.n) or 
    (sum(self._state[:,i])==-self.n):
            self.winner = -1
            return True
    if (self._state.trace()==self.n) or 
(self._state[::-1].trace()==self.n):
        self.winner = 1
        return True
    elif (self._state.trace()==-self.n) or (self._state[::-1].trace()==- 
   self.n):
        self.winner = -1
        return True
    if not (0 in self._state):
        return True
def _step(self,action):
    self._state[action//3,action%3]=1
    self._episode_ended = self.check_game_over
    if self._episode_ended==True:
        if self.winner == 1:
            reward = 1
        elif self.winner == None:
            reward = 0
        else:
            reward = -1
        return ts.termination(self._state,dtype = 'int32',reward=reward)
    else:
        return ts.transition(self._state,dtype = 'int32',reward = 
0.0,discount = 0.9)
env = TicTacToe(3)
utils.validate_py_environment(env, episodes=5)

这是我遇到的错误：

value error trackback(最近的最新电话( 在----> 1 utils.validate_py_environment(env，情节= 5(

c： user bzhang appdata local continuum anaconda3 lib lib site-packages tf_agents tf_agents emoverments univerments utils.pys.py in validate_py_environment(环境，情节( 58提高价值Error( 59'给定time_step：％r与预期的time_step_spec：％r'％不匹配---> 60(time_step，time_step_spec(( 61 62操作= Random_policy.Action(time_step(.Action

value error：给定 time_step：timeStep(step_type = array(0(，奖励= array(0。，dtype = float32(，discount = array = array(1。，dtype = float32(，observation = arnay = array = array([[0。0.，0。]，， [0.，0.，0。]，，， [0.，0.，0。]((((不匹配预期的 time_step_spec：timeStep(step_type = arrayspec(shape =((，dtype = dtype('int32'(，name ='step_type'(，奖励= arrayspec(arrayspec(shape =((，dtype = dtype('float32'(，name ='reward'(，discount = boundedarrayspec(shape =((，dtype = dtype('float32'(，name ='discount'，name ='discount'，最小值，最大值，最大值，最大值= 1.0(，观察= boundeDarreaySpec(shape =(3，3(，dtype = dtype('int32'(，name ='tictactoe board state state spec'，minimim = -1，maximum = 1((

您的观察结果与规格不匹配，您需要将dtype=np.int32传递给NP数组以确保类型匹配。

相关内容

最新更新

热门标签：