I have a collection of documents split into roughly 350 classes. I'm trying to build a TF-IDF multinomial Naive Bayes model to predict the class of new documents. Everything appears to run fine, except that the test predictions all come out as a single value (even when I run the test on thousands of documents). What am I missing?
Here is the relevant code:
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
stemmer = SnowballStemmer("english")
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer(norm='l1', use_idf=True, smooth_idf=False, sublinear_tf=False)
clf = MultinomialNB()

mycsv = pd.read_csv("C:/DocumentsToClassify.csv", encoding='latin-1')
Document_text = mycsv.document.str.lower()
y = mycsv.document_group

# Training set: every second document
Y = []
stemmed_documents = []
for i in range(0, 50000, 2):
    tokenized_document = tokenizer.tokenize(Document_text[i])
    stemmed_document = ""
    for w in tokenized_document:
        if w not in stop_words:
            w = re.sub(r'\d+', '', w)   # strip digits
            if w:                       # skip tokens that were digits only
                stemmed_document = stemmed_document + " " + stemmer.stem(w)
    stemmed_documents = np.append(stemmed_documents, stemmed_document)
    Y = np.append(Y, y[i])

# Test set: every fourth document, offset by one
Y_correct = []
test_documents = []
for i in range(1, 50000, 4):
    tokenized_document = tokenizer.tokenize(Document_text[i])
    stemmed_document = ""
    for w in tokenized_document:
        if w not in stop_words:
            w = re.sub(r'\d+', '', w)
            if w:
                stemmed_document = stemmed_document + " " + stemmer.stem(w)
    test_documents = np.append(test_documents, stemmed_document)
    Y_correct = np.append(Y_correct, y[i])

# Vectorize: fit on the training documents, then transform both sets
Word_counts = count_vect.fit_transform(stemmed_documents)
Words_tfidf = tfidf_transformer.fit_transform(Word_counts)
Word_counts_test = count_vect.transform(test_documents)
Words_tfidf_test = tfidf_transformer.transform(Word_counts_test)

# Training
clf.fit(Words_tfidf, Y)

# Test
Ynew = clf.predict(Words_tfidf_test)
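To confirm that the model really is collapsing to one class (and that it isn't just my evaluation that's off), here is a small diagnostic sketch I run afterwards; Ynew, Y and Y_correct are the arrays defined above:

# Diagnostic sketch: compare the predicted label distribution with the training labels
pred_classes, pred_counts = np.unique(Ynew, return_counts=True)
train_classes, train_counts = np.unique(Y, return_counts=True)

print("distinct classes predicted:", len(pred_classes))
print("most frequent prediction:", pred_classes[pred_counts.argmax()],
      "covers", pred_counts.max() / pred_counts.sum(), "of the test set")
print("most frequent training class:", train_classes[train_counts.argmax()],
      "covers", train_counts.max() / train_counts.sum(), "of the training set")

# Plain accuracy against the held-out labels
print("accuracy:", np.mean(Ynew == Y_correct))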
After struggling with this for a while yesterday, I came up with a solution: switching from MultinomialNB() to SGDClassifier(). I'm not sure why it didn't work with MultinomialNB(), but SGD works well. Here is the relevant (and much shorter) code, closely following http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(norm='l1', use_idf=True, smooth_idf=True, sublinear_tf=False)),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
                     ])

# Training dataset
train_data = pd.read_csv("A:/DocumentsWithGroupTrain.csv", encoding='latin-1')
# Test dataset
test_data = pd.read_csv("A:/DocumentsWithGroupTest.csv", encoding='latin-1')

text_clf.fit(train_data.document, train_data.doc_group)
predicted = text_clf.predict(test_data.document)

# Accuracy on the held-out test set
print(np.mean(predicted == test_data.doc_group))
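If anyone wants to dig into why MultinomialNB behaved that way, one thing worth checking (a guess on my part, not something I've verified) is the interaction between norm='l1' and the default smoothing: with L1-normalised tf-idf, each document's features sum to 1, so the default alpha=1.0 pseudo-count can swamp the actual feature weights and the prediction falls back towards the most frequent class. A quick experiment is to drop MultinomialNB into the same pipeline with a much smaller alpha:

from sklearn.naive_bayes import MultinomialNB

# Experimental sketch (assumption, untested): same pipeline as above,
# but with MultinomialNB and a smoothing parameter well below the default alpha=1.0
nb_clf = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer(norm='l1', use_idf=True, smooth_idf=True, sublinear_tf=False)),
                   ('clf', MultinomialNB(alpha=0.01)),
                   ])

nb_clf.fit(train_data.document, train_data.doc_group)
nb_predicted = nb_clf.predict(test_data.document)

# How many distinct classes does it predict now, and how accurate is it?
print(len(np.unique(nb_predicted)), np.mean(nb_predicted == test_data.doc_group))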