I followed this article to compute text similarity. My code is:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
documents = [
"Vodafone Wins ₹ 20,000 Crore Tax Arbitration Case Against Government",
"Voda Idea shares jump nearly 15% as Vodafone wins retro tax case in Hague",
"Gold prices today fall for 4th time in 5 days, down ₹6500 from last month high",
"Silver futures slip 0.36% to Rs 59,415 per kg, down over 12% this week",
"Amazon unveils drone that films inside your home. What could go wrong?",
"IPHONE 12 MINI PERFORMANCE MAY DISAPPOINT DUE TO THE APPLE B14 CHIP",
"Delhi Capitals vs Chennai Super Kings: Prithvi Shaw shines as DC beat CSK to post second consecutive win in IPL",
"French Open 2020: Rafael Nadal handed tough draw in bid for record-equaling 20th Grand Slam"
]
model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
When I run the above code, I get an error.
Full traceback:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\anaconda3\envs\py3_nlp\lib\tarfile.py in nti(s)
    188             s = nts(s, "ascii", "strict")
--> 189             n = int(s.strip() or "0", 8)
    190         except ValueError:

ValueError: invalid literal for int() with base 8: 'ld_tenso'

During handling of the above exception, another exception occurred:

InvalidHeaderError                        Traceback (most recent call last)
~\anaconda3\envs\py3_nlp\lib\tarfile.py in next(self)
   2298             try:
-> 2299                 tarinfo = self.tarinfo.fromtarfile(self)
   2300             except EOFHeaderError as e:

~\anaconda3\envs\py3_nlp\lib\tarfile.py in fromtarfile(cls, tarfile)
   1092         buf = tarfile.fileobj.read(BLOCKSIZE)
-> 1093         obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
   1094         obj.offset = tarfile.fileobj.tell() - BLOCKSIZE

~\anaconda3\envs\py3_nlp\lib\tarfile.py in frombuf(cls, buf, encoding, errors)
   1034
-> 1035         chksum = nti(buf[148:156])
   1036         if chksum not in calc_chksums(buf):

~\anaconda3\envs\py3_nlp\lib\tarfile.py in nti(s)
    190         except ValueError:
--> 191             raise InvalidHeaderError("invalid header")
    192     return n

InvalidHeaderError: invalid header

During handling of the above exception, another exception occurred:

ReadError                                 Traceback (most recent call last)
~\anaconda3\envs\py3_nlp\lib\site-packages\torch\serialization.py in _load(f, map_location, pickle_module, **pickle_load_args)
    594     try:
--> 595         return legacy_load(f)
    596     except tarfile.TarError:

~\anaconda3\envs\py3_nlp\lib\site-packages\torch\serialization.py in legacy_load(f)
    505
--> 506     with closing(tarfile.open(fileobj=f, mode='r:', format=tarfile.PAX_FORMAT)) as tar, \
    507             mkdtemp() as tmpdir:

~\anaconda3\envs\py3_nlp\lib\tarfile.py in open(cls, name, mode, fileobj, bufsize, **kwargs)
   1590                 raise CompressionError("unknown compression type %r" % comptype)
-> 1591             return func(name, filemode, fileobj, **kwargs)
   1592

~\anaconda3\envs\py3_nlp\lib\tarfile.py in taropen(cls, name, mode, fileobj, **kwargs)
   1620             raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
-> 1621         return cls(name, mode, fileobj, **kwargs)
   1622

~\anaconda3\envs\py3_nlp\lib\tarfile.py in __init__(self, name, mode, fileobj, format, tarinfo, dereference, ignore_zeros, encoding, errors, pax_headers, debug, errorlevel, copybufsize)
   1483         self.firstmember = None
-> 1484         self.firstmember = self.next()
   1485

~\anaconda3\envs\py3_nlp\lib\tarfile.py in next(self)
   2310             elif self.offset == 0:
-> 2311                 raise ReadError(str(e))
   2312         except EmptyHeaderError:

ReadError: invalid header

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
~\anaconda3\envs\py3_nlp\lib\site-packages\transformers\modeling_utils.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
   1210         try:
-> 1211             state_dict = torch.load(resolved_archive_file, map_location="cpu")
   1212         except Exception:

~\anaconda3\envs\py3_nlp\lib\site-packages\torch\serialization.py in load(f, map_location, pickle_module, **pickle_load_args)
    425         pickle_load_args['encoding'] = 'utf-8'
--> 426         return _load(f, map_location, pickle_module, **pickle_load_args)
    427     finally:

~\anaconda3\envs\py3_nlp\lib\site-packages\torch\serialization.py in _load(f, map_location, pickle_module, **pickle_load_args)
    598         # .zip is used for torch.jit.save and will throw an un-pickling error here
--> 599         raise RuntimeError("{} is a zip archive (did you mean to use torch.jit.load()?)".format(f.name))
    600     # if not a tarfile, reset file offset and proceed

RuntimeError: C:\Users\user1/.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens\pytorch_model.bin is a zip archive (did you mean to use torch.jit.load()?)

During handling of the above exception, another exception occurred:

OSError                                   Traceback (most recent call last)
<ipython-input-3-bba56aac60aa> in <module>
----> 1 model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')

~\anaconda3\envs\py3_nlp\lib\site-packages\sentence_transformers\SentenceTransformer.py in __init__(self, model_name_or_path, modules, device, cache_folder)
     88
     89         if os.path.exists(os.path.join(model_path, 'modules.json')):    #Load as SentenceTransformer model
---> 90             modules = self._load_sbert_model(model_path)
     91         else:   #Load with AutoModel
     92             modules = self._load_auto_model(model_path)

~\anaconda3\envs\py3_nlp\lib\site-packages\sentence_transformers\SentenceTransformer.py in _load_sbert_model(self, model_path)
    820         for module_config in modules_config:
    821             module_class = import_from_string(module_config['type'])
--> 822             module = module_class.load(os.path.join(model_path, module_config['path']))
    823             modules[module_config['name']] = module
    824

~\anaconda3\envs\py3_nlp\lib\site-packages\sentence_transformers\models\Transformer.py in load(input_path)
    122         with open(sbert_config_path) as fIn:
    123             config = json.load(fIn)
--> 124         return Transformer(model_name_or_path=input_path, **config)
    125
    126

~\anaconda3\envs\py3_nlp\lib\site-packages\sentence_transformers\models\Transformer.py in __init__(self, model_name_or_path, max_seq_length, model_args, cache_dir, tokenizer_args, do_lower_case, tokenizer_name_or_path)
     27
     28         config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
---> 29         self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
     30         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path, cache_dir=cache_dir, **tokenizer_args)
     31

~\anaconda3\envs\py3_nlp\lib\site-packages\transformers\models\auto\auto_factory.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    393         if type(config) in cls._model_mapping.keys():
    394             model_class = _get_model_class(config, cls._model_mapping)
--> 395             return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
    396         raise ValueError(
    397             f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"

~\anaconda3\envs\py3_nlp\lib\site-packages\transformers\modeling_utils.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
   1212         except Exception:
   1213             raise OSError(
-> 1214                 f"Unable to load weights from pytorch checkpoint file for '{pretrained_model_name_or_path}' "
   1215                 f"at '{resolved_archive_file}'"
   1216                 "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. "

OSError: Unable to load weights from pytorch checkpoint file for 'C:\Users\user1/.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens' at 'C:\Users\user1/.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens\pytorch_model.bin' If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True.
In short:
OSError: Unable to load weights from pytorch checkpoint file for 'C:\Users\user1/.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens' at 'C:\Users\user1/.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens\pytorch_model.bin' If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True.
I do have pytorch_model.bin in the '.cache\torch\sentence_transformers\sentence-transformers_bert-base-nli-mean-tokens' folder.
Why am I getting this error?
You may need to use the model directly through transformers, without the sentence_transformers wrapper.
The code below is from https://www.sbert.net/examples/applications/computing-embeddings/README.html
As I understand it from the exception, you need to pass from_tf=True to AutoModel.
from transformers import AutoTokenizer, AutoModel
import torch
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
return sum_embeddings / sum_mask
#Sentences we want sentence embeddings for
sentences = ['This framework generates embeddings for each input sentence',
'Sentences are passed as a list of string.',
'The quick brown fox jumps over the lazy dog.']
#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens', from_tf=True)
#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
#Compute token embeddings
with torch.no_grad():
model_output = model(**encoded_input)
#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
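If the goal is still the original text-similarity task, a minimal follow-up sketch (using the sentence_embeddings tensor computed above, together with sklearn, which the question already imports) could look like this:
from sklearn.metrics.pairwise import cosine_similarity
#Convert the pooled torch tensor to a NumPy array and compare every pair of sentences
similarity_matrix = cosine_similarity(sentence_embeddings.numpy())
print(similarity_matrix)  #entry [i][j] is the cosine similarity of sentences i and j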
The cause of the error seems to be that the pre-trained model weights file is unavailable or not loadable.
You can try loading the pre-trained model weights directly:
from transformers import AutoModel
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
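If this direct load fails with the same error, the cached copy of pytorch_model.bin may simply be truncated or corrupted (for example, from an interrupted download). A hedged sketch, assuming the cache location shown in the traceback above, that deletes the cached folder so the next load downloads a fresh copy:
import os
import shutil
#Cache path taken from the traceback above; adjust it for your own machine
cache_dir = os.path.expanduser('~/.cache/torch/sentence_transformers/sentence-transformers_bert-base-nli-mean-tokens')
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)  #remove the possibly corrupted cached model
#Re-running SentenceTransformer(...) or from_pretrained(...) will re-download the files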
Reference: https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens
Also, the model's Hugging Face page states: "This model is deprecated. Please don't use it as it produces sentence embeddings of low quality. You can find recommended sentence embedding models here: SBERT.net - Pretrained Models"
You may want to take a look at those instead.
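For example, a minimal sketch with all-MiniLM-L6-v2, one of the models listed on that pretrained-models page (substitute whichever model the page currently recommends):
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(['first sentence', 'second sentence'])  #encode returns a NumPy array
print(cosine_similarity(embeddings))  #pairwise cosine similarities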