我正在尝试使用 SklearnClassifier 和 ClassifierBasedPOSTagger 构建我自己的基于分类器的词性(POS)标注器。我尝试的代码如下:
# Attempt to train a classifier-based POS tagger on the NLTK treebank sample.
# NOTE(review): this snippet calls `nltk.download` and `nltk.word_tokenize` but
# only imports names *from* nltk, and `sentence` is never assigned here;
# presumably both come from an earlier notebook cell -- confirm before running
# this standalone.
from nltk.corpus import treebank
nltk.download('treebank')
data = treebank.tagged_sents()
# First 3500 tagged sentences are used for training, the rest for testing.
train_data = data[:3500]
test_data = data[3500:]
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from nltk.tag.sequential import ClassifierBasedPOSTagger
# Wrap scikit-learn's BernoulliNB estimator in NLTK's SklearnClassifier.
bnb = SklearnClassifier(BernoulliNB())
# The constructor trains immediately because `train` is given; this is the
# call that raises the TypeError shown in the traceback below.
bnb_tagger = ClassifierBasedPOSTagger(train=train_data,
classifier_builder=bnb.train)
# evaluate tagger on test data and sample sentence
print(bnb_tagger.evaluate(test_data))
# see results on our previously defined sentence
print(bnb_tagger.tag(nltk.word_tokenize(sentence)))
此代码产生以下错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
C:\Users\ABDULL~1.IMR\AppData\Local\Temp/ipykernel_6580/266992580.py in <module>
4
5 bnb = SklearnClassifier(BernoulliNB())
----> 6 bnb_tagger = ClassifierBasedPOSTagger(train=train_data,
7 classifier_builder=bnb.train)
8
~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\tag\sequential.py in __init__(self, feature_detector, train, classifier_builder, classifier, backoff, cutoff_prob, verbose)
637
638 if train:
--> 639 self._train(train, classifier_builder, verbose)
640
641 def choose_tag(self, tokens, index, history):
~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\tag\sequential.py in _train(self, tagged_corpus, classifier_builder, verbose)
673 if verbose:
674 print("Training classifier ({} instances)".format(len(classifier_corpus)))
--> 675 self._classifier = classifier_builder(classifier_corpus)
676
677 def __repr__(self):
~\Miniconda3\envs\nlp_course\lib\site-packages\nltk\classify\scikitlearn.py in train(self, labeled_featuresets)
110
111 X, y = list(zip(*labeled_featuresets))
--> 112 X = self._vectorizer.fit_transform(X)
113 y = self._encoder.fit_transform(y)
114 self._clf.fit(X, y)
~\Miniconda3\envs\nlp_course\lib\site-packages\sklearn\feature_extraction\_dict_vectorizer.py in fit_transform(self, X, y)
288 Feature vectors; always 2-d.
289
--> 290 return self._transform(X, fitting=True)
291
292 def inverse_transform(self, X, dict_type=dict):
~\Miniconda3\envs\nlp_course\lib\site-packages\sklearn\feature_extraction\_dict_vectorizer.py in _transform(self, X, fitting)
233 if feature_name in vocab:
234 indices.append(vocab[feature_name])
--> 235 values.append(self.dtype(v))
236
237 indptr.append(len(indices))
TypeError: float() argument must be a string or a number, not 'NoneType'
应该怎样才能正确实现?
根据该问题下的评论,这个错误是由 scikit-learn 的一个 bug 引起的:sklearn/feature_extraction/_dict_vectorizer.py 中 DictVectorizer 的 _transform 方法在输入参数 X 包含值为 None 的映射时会失败。按照 Tom Aarsen 的建议,可以用下面的示例绕过这个问题:
import nltk
from nltk.corpus import treebank
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from nltk.tag.sequential import ClassifierBasedPOSTagger
# Fetch the Penn Treebank sample corpus used for this demonstration.
nltk.download('treebank')
data = treebank.tagged_sents()
# Train on the first three tagged sentences; everything after them is test data.
train_data, test_data = data[:3], data[3:]
class CustomClassifierBasedPOSTagger(ClassifierBasedPOSTagger):
    """ClassifierBasedPOSTagger whose feature values are always strings.

    The feature dict produced by the parent class can contain ``None``
    values, and scikit-learn's ``DictVectorizer`` fails on those with
    ``TypeError: float() argument must be a string or a number, not
    'NoneType'``. Converting every value to ``str`` avoids that error.
    """

    def feature_detector(self, tokens, index, history):
        """Return the parent's feature dict with every value cast to ``str``.

        The arguments are forwarded unchanged to the parent implementation.
        """
        return {
            key: str(value)  # Ensure that the feature value is a string. Converts None to 'None'
            for key, value in super().feature_detector(tokens, index, history).items()
        }
# Wrap scikit-learn's BernoulliNB estimator in NLTK's SklearnClassifier.
bnb = SklearnClassifier(BernoulliNB())
# Passing `train` makes the tagger train during construction.
bnb_tagger = CustomClassifierBasedPOSTagger(
    train=train_data,
    classifier_builder=bnb.train,
    verbose=True,
)
sentence = "This is a sample sentence which I just made for fun."
# Score the tagger on the test sentences.
test_score = bnb_tagger.evaluate(test_data)
print(test_score)
# Tag the sample sentence defined above.
sample_tokens = nltk.word_tokenize(sentence)
print(bnb_tagger.tag(sample_tokens))
输出如下:
[nltk_data] Downloading package treebank to C:\Users\Tom/nltk_data...
[nltk_data] Package treebank is already up-to-date!
Constructing training corpus for classifier.
Training classifier (58 instances)
0.09338289371682999
[('This', 'NNP'), ('is', 'NNP'), ('a', 'NNP'), ('sample', 'NNP'), ('sentence', 'NNP'), ('which', 'NNP'), ('I', 'NNP'), ('just', 'NNP'), ('made', 'NNP'), ('for', 'NNP'), ('fun', 'NNP'), ('.', 'NNP')]