使用 NLTK 的 SklearnClassifier 和 ClassifierBasedPOSTagger 构建自己的基于分类器的 POS 标记器



我正在尝试使用 SklearnClassifier 和 ClassifierBasedPOSTagger 构建我自己的基于分类器的 POS 标记器。我试过的代码如下:

# Reproduction script from the question: constructing the tagger below raises
# the TypeError shown in the traceback that follows.
# NOTE(review): `nltk` itself is never imported in this snippet, so
# `nltk.download` / `nltk.word_tokenize` presumably rely on an earlier
# `import nltk` in the notebook - confirm.
from nltk.corpus import treebank
nltk.download('treebank')
data = treebank.tagged_sents()
# The first 3500 tagged sentences train the tagger; the remainder test it.
train_data = data[:3500]
test_data = data[3500:]
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from nltk.tag.sequential import ClassifierBasedPOSTagger
# Wrap scikit-learn's BernoulliNB in NLTK's SklearnClassifier.
bnb = SklearnClassifier(BernoulliNB())
# Passing train= makes the constructor train immediately; this is the call
# that fails, because a feature value of None reaches DictVectorizer.
bnb_tagger = ClassifierBasedPOSTagger(train=train_data,
classifier_builder=bnb.train)
# evaluate tagger on test data and sample sentence
print(bnb_tagger.evaluate(test_data))
# see results on our previously defined sentence
# NOTE(review): `sentence` is not defined anywhere in this snippet.
print(bnb_tagger.tag(nltk.word_tokenize(sentence)))

此代码产生以下错误:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:\Users\ABDULL~1.IMR\AppData\Local\Temp/ipykernel_6580/266992580.py in <module>
4 
5 bnb = SklearnClassifier(BernoulliNB())
----> 6 bnb_tagger = ClassifierBasedPOSTagger(train=train_data,
7                                       classifier_builder=bnb.train)
8 
~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\tag\sequential.py in __init__(self, feature_detector, train, classifier_builder, classifier, backoff, cutoff_prob, verbose)
637 
638         if train:
--> 639             self._train(train, classifier_builder, verbose)
640 
641     def choose_tag(self, tokens, index, history):
~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\tag\sequential.py in _train(self, tagged_corpus, classifier_builder, verbose)
673         if verbose:
674             print("Training classifier ({} instances)".format(len(classifier_corpus)))
--> 675         self._classifier = classifier_builder(classifier_corpus)
676 
677     def __repr__(self):
~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\classify\scikitlearn.py in train(self, labeled_featuresets)
110 
111         X, y = list(zip(*labeled_featuresets))
--> 112         X = self._vectorizer.fit_transform(X)
113         y = self._encoder.fit_transform(y)
114         self._clf.fit(X, y)
~\Miniconda3\envs\nlp_course\lib\site-packages\sklearn\feature_extraction\_dict_vectorizer.py in fit_transform(self, X, y)
288             Feature vectors; always 2-d.
289         
--> 290         return self._transform(X, fitting=True)
291 
292     def inverse_transform(self, X, dict_type=dict):
~\Miniconda3\envs\nlp_course\lib\site-packages\sklearn\feature_extraction\_dict_vectorizer.py in _transform(self, X, fitting)
233                     if feature_name in vocab:
234                         indices.append(vocab[feature_name])
--> 235                         values.append(self.dtype(v))
236 
237             indptr.append(len(indices))
TypeError: float() argument must be a string or a number, not 'NoneType'

应该如何正确地实现?

根据这个问题下的评论,该错误是由 scikit-learn 的一个 bug 引起的:scikit-learn 的 sklearn/feature_extraction/_dict_vectorizer.py 中,DictVectorizer 的 _transform 方法在输入参数 X 中存在值为 None 的特征时会失败。按照 Tom Aarsen 的说法,目前可以用下面的示例绕过这个问题:

import nltk
from nltk.corpus import treebank
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from nltk.tag.sequential import ClassifierBasedPOSTagger
# Download the 'treebank' corpus that the tagged sentences are read from.
nltk.download('treebank')
data = treebank.tagged_sents()
# Deliberately tiny split for the demo: the first three tagged sentences train
# the tagger and every remaining sentence is used for evaluation.
train_data, test_data = data[:3], data[3:]
class CustomClassifierBasedPOSTagger(ClassifierBasedPOSTagger):
    """ClassifierBasedPOSTagger variant whose feature values are always strings.

    The inherited feature detector can emit ``None`` as a feature value, and
    scikit-learn's ``DictVectorizer`` (used by ``SklearnClassifier.train``)
    raises ``TypeError`` when it tries to convert such a value with
    ``float()``. Stringifying every value avoids that failure.
    """

    def feature_detector(self, tokens, index, history):
        """Return the parent's feature dict with every value converted to ``str``.

        Args:
            tokens: The tokens of the sentence being tagged.
            index: Position in ``tokens`` of the token to extract features for.
            history: Tags already assigned to the tokens before ``index``.

        Returns:
            A dict with the same keys as the parent's feature dict, each value
            passed through ``str`` (so ``None`` becomes ``'None'``).
        """
        features = super().feature_detector(tokens, index, history)
        # Ensure that the feature value is a string. Converts None to 'None'
        return {key: str(value) for key, value in features.items()}
# Wrap scikit-learn's BernoulliNB in NLTK's SklearnClassifier.
bnb = SklearnClassifier(BernoulliNB())
# Passing train= makes the tagger train during construction; verbose=True
# prints progress messages while it does so.
bnb_tagger = CustomClassifierBasedPOSTagger(
    train=train_data,
    classifier_builder=bnb.train,
    verbose=True,
)
sentence = "This is a sample sentence which I just made for fun."
# Score the trained tagger on the held-out sentences.
print(bnb_tagger.evaluate(test_data))
# Tokenize the sample sentence defined above and print its predicted tags.
print(bnb_tagger.tag(nltk.word_tokenize(sentence)))

输出如下:

[nltk_data] Downloading package treebank to C:\Users\Tom/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
Constructing training corpus for classifier.
Training classifier (58 instances)
0.09338289371682999
[('This', 'NNP'), ('is', 'NNP'), ('a', 'NNP'), ('sample', 'NNP'), ('sentence', 'NNP'), ('which', 'NNP'), ('I', 'NNP'), ('just', 'NNP'), ('made', 'NNP'), ('for', 'NNP'), ('fun', 'NNP'), ('.', 'NNP')]

最新更新