Hugging Face AutoTokenizer | ValueError: Couldn't instantiate the backend tokenizer



Goal: modify this notebook to use the albert-base-v2 model.

The error occurs in section 1.3.

Kernel: conda_pytorch_p36. I did Restart & Run All, and refreshed the file view in the working directory.


The error message lists three possible causes; I'm not sure which one my case falls under.

Section 1.3:

# define the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
configs.output_dir, do_lower_case=configs.do_lower_case)

Traceback:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-25-1f864e3046eb> in <module>
140 # define the tokenizer
141 tokenizer = AutoTokenizer.from_pretrained(
--> 142         configs.output_dir, do_lower_case=configs.do_lower_case)
143 
144 # Evaluate the original FP32 BERT model
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
548             tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
549             if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 550                 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
551             else:
552                 if tokenizer_class_py is not None:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1752             use_auth_token=use_auth_token,
1753             cache_dir=cache_dir,
-> 1754             **kwargs,
1755         )
1756 
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1880         # Instantiate tokenizer.
1881         try:
-> 1882             tokenizer = cls(*init_inputs, **init_kwargs)
1883         except OSError:
1884             raise OSError(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/albert/tokenization_albert_fast.py in __init__(self, vocab_file, tokenizer_file, do_lower_case, remove_space, keep_accents, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, **kwargs)
159             cls_token=cls_token,
160             mask_token=mask_token,
--> 161             **kwargs,
162         )
163 
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_fast.py in __init__(self, *args, **kwargs)
116         else:
117             raise ValueError(
--> 118                 "Couldn't instantiate the backend tokenizer from one of: \n"
119                 "(1) a `tokenizers` library serialization file, \n"
120                 "(2) a slow tokenizer instance to convert or \n"
ValueError: Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.

Please let me know if there's anything else I can add to this post.

First, I had to install sentencepiece.
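In the notebook that amounts to a cell like this (my own cell, followed by a kernel restart so the new package gets picked up):

!pip install sentencepiece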

However, I then got a sentencepiece error on the same line of code.

Wrapping str() around both arguments produces the same Traceback.
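That is, roughly this variant (same names as the notebook; my paraphrase):

tokenizer = AutoTokenizer.from_pretrained(
        str(configs.output_dir), do_lower_case=str(configs.do_lower_case))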

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-12-1f864e3046eb> in <module>
140 # define the tokenizer
141 tokenizer = AutoTokenizer.from_pretrained(
--> 142         configs.output_dir, do_lower_case=configs.do_lower_case)
143 
144 # Evaluate the original FP32 BERT model
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
548             tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
549             if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 550                 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
551             else:
552                 if tokenizer_class_py is not None:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1752             use_auth_token=use_auth_token,
1753             cache_dir=cache_dir,
-> 1754             **kwargs,
1755         )
1756 
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1776                 copy.deepcopy(init_configuration),
1777                 *init_inputs,
-> 1778                 **(copy.deepcopy(kwargs)),
1779             )
1780         else:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1880         # Instantiate tokenizer.
1881         try:
-> 1882             tokenizer = cls(*init_inputs, **init_kwargs)
1883         except OSError:
1884             raise OSError(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/albert/tokenization_albert.py in __init__(self, vocab_file, do_lower_case, remove_space, keep_accents, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, sp_model_kwargs, **kwargs)
179 
180         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
--> 181         self.sp_model.Load(vocab_file)
182 
183     @property
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sentencepiece/__init__.py in Load(self, model_file, model_proto)
365       if model_proto:
366         return self.LoadFromSerializedProto(model_proto)
--> 367       return self.LoadFromFile(model_file)
368 
369 
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sentencepiece/__init__.py in LoadFromFile(self, arg)
169 
170     def LoadFromFile(self, arg):
--> 171         return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
172 
173     def DecodeIdsWithCheck(self, ids):
TypeError: not a string
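"not a string" suggests that vocab_file ends up as None, i.e. the directory has no spiece.model (the SentencePiece vocab the slow ALBERT tokenizer loads). A quick check I'd run (my own snippet; configs.output_dir comes from the notebook):

import os

# The ALBERT tokenizer looks for spiece.model (slow) or tokenizer.json (fast)
# in the directory passed to from_pretrained. If neither is there, only the
# model weights were saved and the tokenizer can't be rebuilt from the folder.
print(os.listdir(configs.output_dir))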

Then I had to swap the argument out for the model name:

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
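If the missing-vocab guess above is right, then saving the tokenizer into the same directory should let the tutorial's original line work again. A sketch, assuming configs.output_dir so far only holds the model weights (untested on my side):

from transformers import AlbertTokenizer, AutoTokenizer

# Download the tokenizer from the Hub and write its files
# (spiece.model, special_tokens_map.json, ...) next to the model.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
tokenizer.save_pretrained(configs.output_dir)

# The original call from section 1.3 should now resolve the files locally.
tokenizer = AutoTokenizer.from_pretrained(
        configs.output_dir, do_lower_case=configs.do_lower_case)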

The second part is detailed in an SO post.

Latest update