DQN performs poorly in evaluation mode



I built a DQN to learn tic-tac-toe. So far I let the agent play all the moves, to see whether it learns to always make legal moves that lead to a draw or a win. After training the network for about 10,000 games, it manages to draw or win roughly 30% to 40% of them.

After that, I wanted to test the network in evaluation mode. Unfortunately, it performs much worse there, drawing or winning only about 1% of the games. The code used for training and testing looks like this:

def train(n_games, lr):
    env = TicTacToe()
    brain = Agent(gamma=0.99, epsilon=1.0, batch_size=512, n_actions=9,
                  input_dims=[10], lr=lr)
    scores = []
    test_scores = []
    eps_history = []

    for i in range(n_games):
        if i % 100 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i-100):(i+1)])
            print('episode:', i, 'average score %.3f:' % avg_score, 'epsilon:', brain.epsilon)
        score = 0
        eps_history.append(brain.epsilon)
        observation = env.reset()
        done = False
        while not done:
            action = brain.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            brain.store_transition(observation, action, reward, observation_, done)
            brain.learn()
            observation = observation_
        scores.append(score)

    # testing the performance
    for i in range(100):
        brain.epsilon = 0.3
        score = 0
        observation = env.reset()
        done = False
        while not done:
            action = brain.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            observation = observation_
        test_scores.append(score)
    print(np.mean(test_scores))

The only thing that changed between the training loop and the testing loop is that these two lines were removed:

brain.store_transition(observation, action, reward, observation_, done)
brain.learn()
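
For reference, here is a minimal sketch (not the code used above) of what a purely greedy evaluation pass could look like with the same Agent and TicTacToe interface; epsilon is temporarily set to 0 so only the network's argmax actions are taken:

def evaluate(brain, env, n_games=100):
    # Sketch only: greedy evaluation using the same interface as the code above.
    old_epsilon = brain.epsilon
    brain.epsilon = 0.0          # no random exploration during evaluation
    eval_scores = []
    for _ in range(n_games):
        score = 0
        observation = env.reset()
        done = False
        while not done:
            action = brain.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            observation = observation_
        eval_scores.append(score)
    brain.epsilon = old_epsilon  # restore exploration for further training
    return np.mean(eval_scores)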

The Agent class looks like this:

class Agent(object):
    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                 max_mem_size=1_000_000, eps_end=0.05, eps_dec=0.99995):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_end = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.action_space = [i for i in range(n_actions)]
        self.mem_size = max_mem_size        # memory to store experiences and randomly sample from
        self.mem_counter = 0
        self.Q_eval = DQN(lr=self.lr, n_actions=self.n_actions, input_dims=input_dims,
                          fc1_dims=32, fc2_dims=32)
        self.Q_target = DQN(lr=self.lr, n_actions=self.n_actions, input_dims=input_dims,
                            fc1_dims=64, fc2_dims=64)
        self.state_memory = np.zeros((self.mem_size, *input_dims))
        self.new_state_memory = np.zeros((self.mem_size, *input_dims))
        self.action_memory = np.zeros((self.mem_size, self.n_actions),
                                      dtype=np.uint8)
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.uint8)      # sequence of done flags

    def store_transition(self, state, action, reward, state_, terminal):
        index = self.mem_counter % self.mem_size
        self.state_memory[index] = state
        actions = np.zeros(self.n_actions)
        actions[action] = 1.0       # one-hot encoding of the action
        self.action_memory[index] = actions
        self.reward_memory[index] = reward
        self.terminal_memory[index] = not terminal
        self.new_state_memory[index] = state_
        self.mem_counter += 1

    def choose_action(self, observation):
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            actions = self.Q_eval.forward(observation)
            action = torch.argmax(actions).item()
        return action

    def learn(self):
        if self.mem_counter > self.batch_size:
            self.Q_eval.optimizer.zero_grad()
            max_mem = self.mem_counter if self.mem_counter < self.mem_size else self.mem_size
            batch = np.random.choice(max_mem, self.batch_size)
            state_batch = self.state_memory[batch]
            action_batch = self.action_memory[batch]
            action_values = np.array(self.action_space, dtype=np.int32)
            action_indices = np.dot(action_batch, action_values)
            reward_batch = self.reward_memory[batch]
            terminal_batch = self.terminal_memory[batch]
            new_state_batch = self.new_state_memory[batch]
            reward_batch = torch.Tensor(reward_batch).to(self.Q_eval.device)
            terminal_batch = torch.Tensor(terminal_batch).to(self.Q_eval.device)
            q_eval = self.Q_eval.forward(state_batch).to(self.Q_eval.device)
            # q_target = self.Q_target.forward(state_batch).to(self.Q_target.device)  # alternative to q_eval.clone()
            q_target = q_eval.clone()
            q_next = self.Q_eval.forward(new_state_batch).to(self.Q_eval.device)
            batch_index = np.arange(self.batch_size, dtype=np.int32)
            q_target[batch_index, action_indices] = reward_batch + \
                self.gamma * torch.max(q_next, dim=1)[0] * terminal_batch
            self.epsilon = (self.epsilon * self.eps_dec
                            if self.epsilon > self.eps_end else self.eps_end)
            loss = F.smooth_l1_loss(q_target, q_eval).to(self.Q_eval.device)
            loss.backward()
            self.Q_eval.optimizer.step()
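
As a side note, the np.dot in learn() simply converts the stored one-hot actions back into integer action indices. A tiny standalone illustration of that (not part of the Agent class):

import numpy as np

# One-hot rows dotted with [0, 1, ..., 8] recover the original action indices.
action_batch = np.array([[0, 0, 1, 0, 0, 0, 0, 0, 0],   # action 2
                         [0, 0, 0, 0, 0, 0, 0, 1, 0]],  # action 7
                        dtype=np.uint8)
action_values = np.arange(9, dtype=np.int32)
print(np.dot(action_batch, action_values))   # [2 7]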

Can anyone explain why this happens? Thanks for your help!

Additional information:

It turns out that the network always computes the same Q values regardless of the state it is given, so basically this line:

actions = self.Q_eval.forward(observation)

always returns the same Q values, even for different state observations. For example:

[1, 0, 0, 0, 0, 0, 0, 0, 0, -1]
tensor([0.3155, 0.1449, 0.2217, 0.2078, 0.1867, 0.1810, 0.2689, 0.1995, 0.3029],
grad_fn=<AddBackward0>)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
tensor([0.3155, 0.1449, 0.2217, 0.2078, 0.1867, 0.1810, 0.2689, 0.1995, 0.3029],
grad_fn=<AddBackward0>)
[1, 0, 0, 0, 0, 0, 0, 0, 0, -1]
tensor([0.3155, 0.1449, 0.2217, 0.2078, 0.1867, 0.1810, 0.2689, 0.1995, 0.3029],
grad_fn=<AddBackward0>)

The first line is the input state passed to the forward method, and the second line shows the corresponding Q values for the possible actions.
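
The check itself was just a small loop along these lines (illustrative only; it assumes a trained brain and that Q_eval.forward accepts a raw board observation, exactly as choose_action passes it):

# Hypothetical check, not part of the training code above: print the network
# output for a few hand-written board states and compare the results.
test_states = [
    [1, 0, 0, 0, 0, 0, 0, 0, 0, -1],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
]
for s in test_states:
    print(s)
    print(brain.Q_eval.forward(s))   # prints the same tensor for every state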

Latest update:

It turned out that my learning rate was far too large, by roughly a factor of 1000.
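
The exact values are not shown here, and the numbers below are purely illustrative, but the fix amounted to constructing the agent with a learning rate roughly three orders of magnitude smaller:

# Illustrative values only, not the actual numbers used:
# brain = Agent(gamma=0.99, epsilon=1.0, batch_size=512, n_actions=9,
#               input_dims=[10], lr=1.0)       # far too large
brain = Agent(gamma=0.99, epsilon=1.0, batch_size=512, n_actions=9,
              input_dims=[10], lr=0.001)       # roughly 1000x smaller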