我用 PyTorch 构建了一个 Bi-LSTM 语言模型,发现训练大约 200 个 epoch 之后,模型的损失突然变成 NaN,并且只输出无意义的 token,而在此之前它输出的 token 都是合理的。
请参考以下模型代码:
# optimizer = torch.optim.Adam(model.parameters(), lr=0.009, amsgrad=False)
class BiLSTM(nn.Module):
    """Bidirectional LSTM language model.

    Embeds token ids, runs a single-layer bidirectional LSTM, shifts the
    forward and backward streams by two steps so neither direction can see
    its own prediction target, then projects to vocabulary logits.
    """

    def __init__(self, voc_size, hidn_size, emb_size=300):
        super().__init__()
        self.voc_size = voc_size
        self.emb_size = emb_size
        self.hidn_size = hidn_size
        self.emb = nn.Embedding(num_embeddings=voc_size, embedding_dim=emb_size)
        self.lstm = nn.LSTM(input_size=emb_size, hidden_size=hidn_size, bidirectional=True)
        self.lm_out = nn.Linear(hidn_size * 2, voc_size)
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x, prev_state):
        """Compute vocabulary logits for a batch of token-id sequences.

        x: (batch, seq) tensor of token ids — assumed layout; TODO confirm
            against the caller.
        prev_state: (h, c) tuple; its batch dimension is trimmed to the
            current batch so a carried-over state still fits a smaller
            final batch.
        Returns a (batch * (seq - 2), voc_size) logits tensor and the new
        (h, c) state.
        """
        state_h, state_c = prev_state
        batch_size = len(x)
        # (batch, seq, emb) -> (seq, batch, emb): nn.LSTM is seq-first by default.
        embedded = self.emb(x).permute(1, 0, -1)
        out, (state_h, state_c) = self.lstm(
            embedded,
            (state_h[:, :batch_size, :].contiguous(),
             state_c[:, :batch_size, :].contiguous()),
        )
        # The bidirectional output concatenates both directions along the
        # feature axis; split it into equal forward/backward halves.
        fwd_stream, bwd_stream = out.chunk(2, dim=2)
        # Shift by two steps so position t pairs forward context (< t) with
        # backward context (> t) — no direction leaks its own target token.
        merged = torch.cat([fwd_stream[:-2], bwd_stream[2:]], dim=2)
        logits = self.lm_out(self.dropout(merged.permute(1, 0, 2)))
        return logits.view(-1, logits.size(-1)), (state_h, state_c)
问题是由梯度爆炸(exploding gradient)引起的。我是通过检查输出层的权重和梯度发现这一点的:
model.lm_out.weight
>>>
tensor([[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]],
device='cuda:0', requires_grad=True)
model.lm_out.weight.grad
>>>
tensor([[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]],
device='cuda:0', requires_grad=True)
于是我把训练循环中的代码修改如下:
# Backpropagate, then rescale all gradients so their global L2 norm is at
# most 1 before the optimizer applies them — this caps the update size and
# prevents the exploding-gradient NaNs.
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
optimizer.step()
它解决了我的问题。