I can't get my encoded data (with hidden states) to pass correctly through Hugging Face's Trainer. Below is the call to Trainer with its arguments and the full traceback. I'm really not sure where to start with this error, because I believe I've met all the requirements for passing encoded data, unless the inputs passed are also supposed to include labels.
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
from transformers import Trainer, TrainingArguments
batch_size = 10
logging_steps = len(transcripts_encoded["train"]) // batch_size
model_name = f"{model_checkpoint}-finetuned-transcripts"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  log_level="error")
from transformers import Trainer
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=transcripts_encoded["train"],
                  eval_dataset=transcripts_encoded["valid"],
                  tokenizer=tokenizer)
trainer.train();
Here is the full traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-124-76d295da3120> in <module>
24 tokenizer=tokenizer)
25
---> 26 trainer.train();
/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1503 resume_from_checkpoint=resume_from_checkpoint,
1504 trial=trial,
-> 1505 ignore_keys_for_eval=ignore_keys_for_eval,
1506 )
1507
/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1747 tr_loss_step = self.training_step(model, inputs)
1748 else:
-> 1749 tr_loss_step = self.training_step(model, inputs)
1750
1751 if (
/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in training_step(self, model, inputs)
2506
2507 with self.compute_loss_context_manager():
-> 2508 loss = self.compute_loss(model, inputs)
2509
2510 if self.args.n_gpu > 1:
/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
2552 if isinstance(outputs, dict) and "loss" not in outputs:
2553 raise ValueError(
-> 2554 "The model did not return a loss from the inputs, only the following keys: "
2555 f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
2556 )
ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.
I was expecting training details (f1, loss, accuracy, etc.). My assumption is that the encoded data with hidden states is not structured in a way the model can train on with the arguments I've set.
Updated model code: here is where I load and split the data
category_data = load_dataset("csv", data_files="testdatafinal.csv")
category_data = category_data.remove_columns(["someid", "someid", "somedimension"])
category_data = category_data['train']
train_testvalid = category_data.train_test_split(test_size=0.3)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
from datasets.dataset_dict import DatasetDict
cd = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
print(cd)
DatasetDict({
train: Dataset({
features: ['Transcript', 'Primary Label'],
num_rows: 646
})
test: Dataset({
features: ['Transcript', 'Primary Label'],
num_rows: 139
})
valid: Dataset({
features: ['Transcript', 'Primary Label'],
num_rows: 139
})
})
Here is where I grab the model checkpoint
import torch
from transformers import AutoModel

model_checkpoint = 'distilbert-base-uncased'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_checkpoint).to(device)
Here is where I set the torch format on the encoded text
# set_format works in place (it returns None), so keep a reference afterwards
transcripts_encoded.set_format("torch",
                               columns=["input_ids", "attention_mask", "Primary Label"])
transcripts_encoded_one = transcripts_encoded
Here is where I extract the hidden states and then map over the dataset
def extract_hidden_states(batch):
    # Place model inputs on the GPU/CPU
    inputs = {k: v.to(device) for k, v in batch.items()
              if k in tokenizer.model_input_names}
    # Extract last hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Return vector for [CLS] token
    return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

transcripts_hidden = transcripts_encoded.map(extract_hidden_states, batched=True)
Calling AutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification
num_labels = 10
model = (AutoModelForSequenceClassification
         .from_pretrained(model_checkpoint, num_labels=num_labels)
         .to(device))
Accuracy metrics
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
Trainer
from transformers import Trainer, TrainingArguments
batch_size = 10
logging_steps = len(transcripts_encoded_one["train"]) // batch_size
model_name = f"{model_checkpoint}-finetuned-transcripts"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  log_level="error")
from transformers import Trainer
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=transcripts_encoded_one["train"],
                  eval_dataset=transcripts_encoded_one["valid"],
                  tokenizer=tokenizer)
trainer.train();
I have tried passing both transcripts_encoded (without hidden states) and transcripts_hidden (with hidden states) as the train and validation splits, and both produce the same error.
trainer.train_dataset[0]
{'Primary Label': 'cancel',
'input_ids': tensor([ 101, 2047, 3446, 2003, 2205, 6450, 2005, 1996, 2051, 1045,
2064, 5247, 3752, 4790, 1012, 2009, 2001, 2026, 5165, 2000,
6509, 2017, 2651, 999, 4067, 2017, 2005, 3967, 2075, 1996,
2047, 2259, 2335, 999, 2031, 1037, 6919, 2717, 1997, 1996,
2154, 999, 2994, 3647, 1998, 7965, 999, 2065, 2045, 2003,
2505, 2842, 2057, 2089, 2022, 2583, 2000, 6509, 2017, 2007,
3531, 2514, 2489, 2000, 3967, 2149, 2153, 1012, 1045, 2001,
2074, 2667, 2000, 17542, 2026, 15002, 1012, 2038, 2009, 2042,
13261, 1029, 7632, 1010, 2045, 999, 1045, 3246, 2017, 1005,
2128, 2725, 2092, 2651, 1012, 4067, 2017, 2005, 3967, 2075,
102]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1])}
If possible, could you add your model code? From your instructions and description, you should be using BartForSequenceClassification. If you are using BartForSequenceClassification, I think the most likely cause is that your training dataset has no labels. In its forward pass, the loss is only computed when labels are passed in:
loss = None
if labels is not None:
    ...

if not return_dict:
    output = (logits,) + outputs[1:]
    return ((loss,) + output) if loss is not None else output

return Seq2SeqSequenceClassifierOutput(
    loss=loss,
    logits=logits,
    past_key_values=outputs.past_key_values,
    decoder_hidden_states=outputs.decoder_hidden_states,
    decoder_attentions=outputs.decoder_attentions,
    cross_attentions=outputs.cross_attentions,
    encoder_last_hidden_state=outputs.encoder_last_hidden_state,
    encoder_hidden_states=outputs.encoder_hidden_states,
    encoder_attentions=outputs.encoder_attentions,
)
The ModelOutput classes in transformers' modeling_outputs drop keys whose value is None, so when no loss is computed the outputs only contain logits, and the Trainer then raises the ValueError you described.
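A minimal sketch of that behaviour (illustrative only, assuming a recent transformers version; not your code):

import torch
from transformers.modeling_outputs import SequenceClassifierOutput

# When no labels are passed to the model, loss stays None and ModelOutput
# drops the key entirely, so only "logits" remains in the output dict.
out = SequenceClassifierOutput(loss=None, logits=torch.zeros(1, 10))
print(list(out.keys()))  # ['logits'] -> Trainer.compute_loss raises the ValueError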
Thanks for such detailed code; I found the problem. You should either set TrainingArguments.label_names to ["Primary Label"], or rename Primary Label to any label string that contains the lowercase word "label", e.g. Primary label (see transformers.utils.generic.find_labels for details). Otherwise the Trainer looks for its default label names rather than Primary Label. Also, you must map the labels to consecutive integers rather than strings like cancel!
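A minimal sketch of both fixes (the column names follow the question; the label2id mapping and the rename to "labels" are illustrative assumptions, not tested against your data):

# Map the string labels (e.g. "cancel") to consecutive integer ids
label_values = sorted(set(cd["train"]["Primary Label"]))
label2id = {name: i for i, name in enumerate(label_values)}

def encode_labels(batch):
    # "labels" matches the model's forward signature, so both the Trainer
    # and the model pick it up without extra configuration
    return {"labels": [label2id[name] for name in batch["Primary Label"]]}

transcripts_encoded = transcripts_encoded.map(encode_labels, batched=True)
transcripts_encoded.set_format("torch",
                               columns=["input_ids", "attention_mask", "labels"])
# The other option described above is to keep the original column and pass
# label_names=["Primary Label"] in TrainingArguments.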