我试图修改本教程中的代码,使其适应多类数据(我有55个不同的类(。一个错误被触发,我不确定根本原因。我对本教程所做的更改已在同一行注释中进行了注释。
两种解决方案之一可以满足这个问题:
(A( 帮助识别错误的根本原因,或
(B( 使用PyTorch LSTM 进行多类别分类的样板脚本
import spacy
import torchtext
from torchtext import data
import re
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)
fields = [(None,None),('text', TEXT), ('wage_label', LABEL)]
train_torch, test_torch = data.TabularDataset.splits(path='/Users/jdmoore7/Desktop/Python Projects/560_capstone/',
format='csv',
train='train_text_target.csv',
test='test_text_target.csv',
fields=fields,
skip_header=True)
import random
train_data, valid_data = train_torch.split(random_state = random.seed(SEED))
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train_data,
max_size = MAX_VOCAB_SIZE,
vectors = "glove.6B.100d",
unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
(train_data, valid_data, test_torch),
batch_size = BATCH_SIZE,
sort_within_batch = True,
device = device)
import torch.nn as nn
class RNN(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
bidirectional, dropout, pad_idx):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
self.rnn = nn.LSTM(embedding_dim,
hidden_dim,
num_layers=n_layers,
bidirectional=bidirectional,
dropout=dropout)
self.fc = nn.Linear(hidden_dim * 2, output_dim)
self.dropout = nn.Dropout(dropout)
def forward(self, text, text_lengths):
#text = [sent len, batch size]
embedded = self.dropout(self.embedding(text))
#embedded = [sent len, batch size, emb dim]
#pack sequence
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
packed_output, (hidden, cell) = self.rnn(packed_embedded)
#unpack sequence
output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
#output = [sent len, batch size, hid dim * num directions]
#output over padding tokens are zero tensors
#hidden = [num layers * num directions, batch size, hid dim]
#cell = [num layers * num directions, batch size, hid dim]
#concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
#and apply dropout
hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
#hidden = [batch size, hid dim * num directions]
return self.fc(hidden)
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab) ### changed from previous value (1)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model = RNN(INPUT_DIM,
EMBEDDING_DIM,
HIDDEN_DIM,
OUTPUT_DIM,
N_LAYERS,
BIDIRECTIONAL,
DROPOUT,
PAD_IDX)
import torch.optim as optim
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss() # Previously: criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)
def binary_accuracy(preds, y):
"""
Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
"""
#round predictions to the closest integer
rounded_preds = torch.round(torch.sigmoid(preds))
correct = (rounded_preds == y).float() #convert into float for division
acc = correct.sum() / len(correct)
return acc
def train(model, iterator, optimizer, criterion):
epoch_loss = 0
epoch_acc = 0
model.train()
for batch in iterator:
optimizer.zero_grad()
text, text_lengths = batch.text
predictions = model(text, text_lengths).squeeze(1)
loss = criterion(predictions, batch.label)
acc = binary_accuracy(predictions, batch.label)
loss.backward()
optimizer.step()
epoch_loss += loss.item()
epoch_acc += acc.item()
return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate(model, iterator, criterion):
epoch_loss = 0
epoch_acc = 0
model.eval()
with torch.no_grad():
for batch in iterator:
text, text_lengths = batch.text
predictions = model(text, text_lengths).squeeze(1)
loss = criterion(predictions, batch.label)
acc = binary_accuracy(predictions, batch.label)
epoch_loss += loss.item()
epoch_acc += acc.item()
return epoch_loss / len(iterator), epoch_acc / len(iterator)
import time
def epoch_time(start_time, end_time):
elapsed_time = end_time - start_time
elapsed_mins = int(elapsed_time / 60)
elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
return elapsed_mins, elapsed_secs
以上所有操作都很顺利,是下一个代码块触发了错误:
N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
start_time = time.time()
train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
if valid_loss < best_valid_loss:
best_valid_loss = valid_loss
torch.save(model.state_dict(), 'tut2-model.pt')
print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
print(f'tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
print(f't Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-888-c1b298b1eeea> in <module>
7 start_time = time.time()
8
----> 9 train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
10 valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
11
<ipython-input-885-9a57198441ec> in train(model, iterator, optimizer, criterion)
6 model.train()
7
----> 8 for batch in iterator:
9
10 optimizer.zero_grad()
~/opt/anaconda3/lib/python3.7/site-packages/torchtext/data/iterator.py in __iter__(self)
140 while True:
141 self.init_epoch()
--> 142 for idx, minibatch in enumerate(self.batches):
143 # fast-forward if loaded from state
144 if self._iterations_this_epoch > idx:
~/opt/anaconda3/lib/python3.7/site-packages/torchtext/data/iterator.py in pool(data, batch_size, key, batch_size_fn, random_shuffler, shuffle, sort_within_batch)
284 for p in batch(data, batch_size * 100, batch_size_fn):
285 p_batch = batch(sorted(p, key=key), batch_size, batch_size_fn)
--> 286 if sort_within_batch
287 else batch(p, batch_size, batch_size_fn)
288 if shuffle:
TypeError: '<' not supported between instances of 'Example' and 'Example'
最后,PyTorch论坛为这个错误打开了一个问题,然而,产生它的代码并不相似,所以我理解这是一个单独的问题。
BucketIterator
对数据进行排序,以生成具有相似长度的示例的批,以避免填充过多。为此,它需要知道排序标准是什么,应该是文本长度。由于它不固定于特定的数据布局,您可以自由选择它应该使用哪个字段,但这也意味着您必须将该信息提供给sort_key
。
在您的情况下,有两个可能的字段,text
和wage_label
,您希望根据text
的长度对其进行排序。
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
(train_data, valid_data, test_torch),
batch_size = BATCH_SIZE,
sort_within_batch = True,
sort_key = lambda x: len(x.text),
device = device)
您可能想知道为什么它在教程中有效,但在您的示例中无效。原因是,如果没有指定sort_key
,它会将其推迟到底层数据集。在本教程中,他们使用了IMDB数据集,该数据集将sort_key
定义为x.text
。您的自定义数据集没有定义它,因此您需要手动指定它。