你好,我是 scikit-learn 的新手,我正在尝试做文本多类分类,并且正在遵循本教程。
我的数据集有 4 个类:'fipdl'、'lna'、'm5s'、'pd'。
因此我有 4 个文件夹(每个类一个),每个文件夹包含 120 个 txt 文件,每个文件大约有 25 行文本(Facebook 状态)。我用 90% 的文件进行训练,10% 进行测试。
用于测试的那 10% 的 txt 文件,文件名都以 "ts" 开头,我用这些文件进行测试。
所以我的代码是:
import sys
import os
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
def usage():
    """Print a short command-line usage message to standard output."""
    print("Usage:")
    # sys.argv[0] is the script path as invoked, so the hint mirrors the
    # command the user actually typed.
    print("python %s <data_dir>" % sys.argv[0])
if __name__ == '__main__':
    # The dataset root directory is the only (required) command-line argument.
    if len(sys.argv) < 2:
        usage()
        sys.exit(1)

    data_dir = sys.argv[1]
    # Each class label is also the name of a sub-directory of data_dir.
    classes = ['fipdl', 'lna', 'm5s', 'pd']

    # Read the data
    train_data = []
    train_labels = []
    test_data = []
    test_labels = []
    for curr_class in classes:
        dirname = os.path.join(data_dir, curr_class)
        for fname in os.listdir(dirname):
            # NOTE(review): no explicit encoding is passed, so the platform
            # default is used -- confirm it matches the encoding of the files.
            with open(os.path.join(dirname, fname), 'r') as f:
                content = f.read()
            # Files whose name starts with 'ts' form the held-out test set;
            # every other file is used for training.
            if fname.startswith('ts'):
                test_data.append(content)
                test_labels.append(curr_class)
            else:
                train_data.append(content)
                train_labels.append(curr_class)

    # Create feature vectors
    vectorizer = TfidfVectorizer(min_df=5,
                                 max_df=0.8,
                                 sublinear_tf=True,
                                 use_idf=True)
    # Fit on the training documents only, then reuse the fitted vectorizer
    # for the test documents so both share the same feature space.
    train_vectors = vectorizer.fit_transform(train_data)
    test_vectors = vectorizer.transform(test_data)

    # Perform classification with SVM, kernel=rbf
    classifier_rbf = svm.SVC()
    t0 = time.time()
    classifier_rbf.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_rbf = classifier_rbf.predict(test_vectors)
    t2 = time.time()
    time_rbf_train = t1 - t0
    time_rbf_predict = t2 - t1

    # Perform classification with SVM, kernel=linear
    classifier_linear = svm.SVC(kernel='linear')
    t0 = time.time()
    classifier_linear.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_linear = classifier_linear.predict(test_vectors)
    t2 = time.time()
    time_linear_train = t1 - t0
    time_linear_predict = t2 - t1

    # Perform classification with LinearSVC (a separate linear SVM
    # implementation, not SVC with kernel='linear')
    classifier_liblinear = svm.LinearSVC()
    t0 = time.time()
    classifier_liblinear.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_liblinear = classifier_liblinear.predict(test_vectors)
    t2 = time.time()
    time_liblinear_train = t1 - t0
    time_liblinear_predict = t2 - t1

    # Print results in a nice table
    print("Results for SVC(kernel=rbf)")
    print("Training time: %fs; Prediction time: %fs" % (time_rbf_train, time_rbf_predict))
    print(classification_report(test_labels, prediction_rbf))
    print("Results for SVC(kernel=linear)")
    print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
    print(classification_report(test_labels, prediction_linear))
    print("Results for LinearSVC()")
    print("Training time: %fs; Prediction time: %fs" % (time_liblinear_train, time_liblinear_predict))
    print(classification_report(test_labels, prediction_liblinear))
输出:
Results for SVC(kernel=rbf)
Training time: 0.940005s; Prediction time: 0.055970s
precision recall f1-score support
fipdl 1.00 1.00 1.00 11
lna 1.00 1.00 1.00 11
m5s 1.00 1.00 1.00 11
pd 1.00 1.00 1.00 11
avg / total 1.00 1.00 1.00 44
Results for SVC(kernel=linear)
Training time: 0.941262s; Prediction time: 0.056382s
precision recall f1-score support
fipdl 1.00 1.00 1.00 11
lna 1.00 1.00 1.00 11
m5s 1.00 1.00 1.00 11
pd 1.00 1.00 1.00 11
avg / total 1.00 1.00 1.00 44
Results for LinearSVC()
Training time: 0.034038s; Prediction time: 0.000323s
precision recall f1-score support
fipdl 1.00 1.00 1.00 11
lna 1.00 1.00 1.00 11
m5s 1.00 1.00 1.00 11
pd 1.00 1.00 1.00 11
avg / total 1.00 1.00 1.00 44
现在结果似乎好得令人难以置信,因为每种方法都给了我 1 的精度.
我认为尝试预测我传递的字符串而不是测试集也会很好,以便进行更多测试,所以我将原始代码更改为:
import sys
import os
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
def usage():
    """Print a short command-line usage message to standard output."""
    print("Usage:")
    # sys.argv[0] is the script path as invoked, so the hint mirrors the
    # command the user actually typed.
    print("python %s <data_dir>" % sys.argv[0])
if __name__ == '__main__':
    # The dataset root directory is the only (required) command-line argument.
    if len(sys.argv) < 2:
        usage()
        sys.exit(1)

    data_dir = sys.argv[1]
    # Each class label is also the name of a sub-directory of data_dir.
    classes = ['fipdl', 'lna', 'm5s', 'pd']

    # Read the data
    train_data = []
    train_labels = []
    test_data = []
    test_labels = []
    for curr_class in classes:
        dirname = os.path.join(data_dir, curr_class)
        for fname in os.listdir(dirname):
            with open(os.path.join(dirname, fname), 'r') as f:
                content = f.read()
            # Files whose name starts with 'ts' form the held-out test set;
            # every other file is used for training.
            if fname.startswith('ts'):
                test_data.append(content)
                test_labels.append(curr_class)
            else:
                train_data.append(content)
                train_labels.append(curr_class)

    # Create feature vectors
    vectorizer = TfidfVectorizer(min_df=5,
                                 max_df=0.8,
                                 sublinear_tf=True,
                                 use_idf=True)
    # NOTE(review): this is the failing version that the surrounding text
    # discusses, so the statement order is intentionally left as posted.
    # transform() is called below before the vectorizer has been fitted;
    # fit_transform(train_data) has to run first.
    string = ['string to predict'] #my string
    vector = vectorizer.transform(string) #convert
    train_vectors = vectorizer.fit_transform(train_data)
    test_vectors = vectorizer.transform(test_data)

    # Perform classification with SVM, kernel=rbf
    classifier_rbf = svm.SVC()
    t0 = time.time()
    classifier_rbf.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_rbf = classifier_rbf.predict(vector) #predict
    t2 = time.time()
    time_rbf_train = t1 - t0
    time_rbf_predict = t2 - t1

    # Perform classification with SVM, kernel=linear
    classifier_linear = svm.SVC(kernel='linear')
    t0 = time.time()
    classifier_linear.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_linear = classifier_linear.predict(test_vectors)
    t2 = time.time()
    time_linear_train = t1 - t0
    time_linear_predict = t2 - t1

    # Perform classification with LinearSVC (a separate linear SVM
    # implementation, not SVC with kernel='linear')
    classifier_liblinear = svm.LinearSVC()
    t0 = time.time()
    classifier_liblinear.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_liblinear = classifier_liblinear.predict(test_vectors)
    t2 = time.time()
    time_liblinear_train = t1 - t0
    time_liblinear_predict = t2 - t1

    # Print results in a nice table
    print("Results for SVC(kernel=rbf)")
    print("Training time: %fs; Prediction time: %fs" % (time_rbf_train, time_rbf_predict))
    # NOTE(review): prediction_rbf holds the prediction for `vector` (the
    # custom string), not for the test set, so its length will generally not
    # match test_labels and this report cannot be computed. Print
    # prediction_rbf directly instead of scoring it against test_labels.
    print(classification_report(test_labels, prediction_rbf))
    print("Results for SVC(kernel=linear)")
    print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
    print(classification_report(test_labels, prediction_linear))
    print("Results for LinearSVC()")
    print("Training time: %fs; Prediction time: %fs" % (time_liblinear_train, time_liblinear_predict))
    print(classification_report(test_labels, prediction_liblinear))
但它失败了
ValueError: Found arrays with inconsistent numbers of samples: [18 44]
我错过了什么?或者也许这是一个完全错误的方法?
任何帮助将不胜感激,
提前感谢。
——尼科
# Create feature vectors
vectorizer = TfidfVectorizer(min_df=5,
max_df = 0.8,
sublinear_tf=True,
use_idf=True)
string = ['string to predict'] #my string
# NOTE: transform() is called here on a vectorizer that has not been fitted
# yet; fit_transform() only runs on the line after it.
vector = vectorizer.transform(string) #convert
train_vectors = vectorizer.fit_transform(train_data)
上面的代码创建了矢量化器的新实例,却在拟合(fit)它之前就调用了 transform
方法。只需调换最后两行的顺序,如下所示:
# Corrected order: fit the vectorizer on the training data first, then use
# the fitted vectorizer to transform the new string.
vectorizer = TfidfVectorizer(min_df=5,
max_df = 0.8,
sublinear_tf=True,
use_idf=True)
string = ['string to predict'] #my string
# fit_transform() fits the vectorizer on train_data and returns its vectors.
train_vectors = vectorizer.fit_transform(train_data)
# The vectorizer is fitted at this point, so transform() can be applied.
vector = vectorizer.transform(string) #convert
即使我还没有弄清楚为什么它会给我完美的结果,我也决定使用不同的方法(MultinomialNB)来对我的文本进行分类,并使用我选择的字符串对其进行测试。我不确定这是否是最好的方法,但它有效,所以我决定将其发布为答案(请注意,并非所有代码行都是必需的):
# -*- coding: utf-8 -*-
import sys
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.datasets import load_files
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
# The text to classify is the first (required) command-line argument.
if len(sys.argv) < 2:
    # sys.exit with a string prints it to stderr and exits with status 1.
    sys.exit('Usage: python %s "<string to predict>"' % sys.argv[0])
string = sys.argv[1]  # the string I'd like to predict

# Load the personal dataset from the 'scikit' directory.
sets = load_files('scikit')

# Turn the training documents into token-count vectors.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sets.data)

# Re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Train the Naive Bayes classifier on the TF-IDF features.
clf = MultinomialNB().fit(X_train_tfidf, sets.target)

# Push the new document through the SAME fitted transformers (transform
# only, no re-fitting) so it lands in the feature space the model was
# trained on.
docs_new = [string]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Print the predicted category name for each input document.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, sets.target_names[category]))
然后从控制台运行script.py "string to predict"
可以对此代码进行很多改进,例如转储经过训练的模型,但对于我的使用来说已经足够了。