list=[]
ct = 1
import numpy as np
import os, os.path
isfile = os.path.isfile
join = os.path.join
fn = 'C:\Users\Keshav\Desktop\xyz\data1\black_and_white\'
target = np.array([1, 2, 3, 4, 5])
num = sum(1 for item in os.listdir(fn) if isfile(join(fn, item)))
for ct in range(1,num+1):
f = open(fn+"1_"+str(ct)+".dat","r")
list.append(f)
ct = ct + 1
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(input="file")
X_train_counts = count_vect.fit_transform(list)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
#print X_train_tfidf.shape
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, target)
#clf.fit(X, y)
docs_new = ['10 years of marriage and now divorce. I just wasted my entire life too with her.']
X_new_counts = count_vect.transform(docs_new)
print X_new_counts
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
print predicted
我正在尝试使用以下链接使用 sklearn 构建一个多类分类器。
这里使用的分类器是多项式朴素贝叶斯分类器。
我收到以下错误:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:UsersKeshavAnacondalibsite-packagesspyderlibwidgetsexternalshellsitecustomize.py", line 580, in runfile
execfile(filename, namespace)
File "C:/Users/Keshav/Desktop/iHeal/mturk-distortions/main1.py", line 40, in <module>
X_new_counts = count_vect.transform(docs_new)
File "C:UsersKeshavAnacondalibsite-packagessklearnfeature_extractiontext.py", line 867, in transform
_, X = self._count_vocab(raw_documents, fixed_vocab=True)
File "C:UsersKeshavAnacondalibsite-packagessklearnfeature_extractiontext.py", line 748, in _count_vocab
for feature in analyze(doc):
File "C:UsersKeshavAnacondalibsite-packagessklearnfeature_extractiontext.py", line 234, in <lambda>
tokenize(preprocess(self.decode(doc))), stop_words)
File "C:UsersKeshavAnacondalibsite-packagessklearnfeature_extractiontext.py", line 109, in decode
doc = doc.read()
AttributeError: 'str' object has no attribute 'read'
知道如何解决吗?
docs_new = ['10 years of marriage and now divorce. I just wasted my entire life too with her.']
是一个字符串列表 - 这不是count_vect.transform
想要的;它想要一个带有read
方法的类似文件的对象列表。
因此,import StringIO
模块顶部并添加
docs_new = [ StringIO.StringIO(x) for x in docs_new ]
就在你第一次分配到docs_new
之后,你 shd 没事......