Data:
rid age income student credit_rating class_buy_computer
1 young high no fair no
2 young high no excellent no
3 middle high no fair yes
4 senior medium no fair yes
5 senior low yes fair yes
6 senior low yes excellent no
7 middle low yes excellent yes
8 young medium no fair yes
9 young low yes fair yes
10 senior medium yes fair yes
11 young medium yes excellent yes
12 middle medium no excellent yes
13 middle high yes fair yes
14 senior medium no excellent no
代码:
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO
# Question's original script: read the CSV, build one feature dict per row,
# vectorize features and labels, then fit an entropy-based decision tree.
# NOTE(review): indentation was lost in this listing, and the path literal
# below appears to have lost its backslash separators -- confirm both against
# the original post.
myData = open(r'C:UsersUSERDesktoptest.csv')
reader = csv.reader(myData)
# The first CSV row holds the column names.
headers=next(reader)
print (headers)
# featuelist collects one {column name: value} dict per appended entry;
# labeList collects the class label of each data row.
featuelist=[]
labeList=[]
for row in reader:
# The last column of every row is the class label.
labeList.append(row[len(row)-1])
rowDict={}
# Columns 1 .. len(row)-2 are the features; column 0 (rid) and the label
# column are skipped.
for i in range(1,len(row)-1):
rowDict[headers[i]]=row[i]
# NOTE(review): the reported error (56 samples vs 14 labels) is consistent
# with this append running inside the inner loop, i.e. once per feature
# column (4 times per row) instead of once per row.
featuelist.append(rowDict)
print(featuelist)
# Turn the feature dicts into a dense numeric array.
vec=DictVectorizer()
dummyX=vec.fit_transform(featuelist).toarray()
print('dummyX:'+str(dummyX))
print(vec.get_feature_names())
print('labeList:'+str(labeList))
# Encode the string class labels numerically.
lb=preprocessing.LabelBinarizer()
dummyY=lb.fit_transform(labeList)
print('dummyY:'+str(dummyY))
# Fit a decision tree that splits on the entropy criterion.
clf=tree.DecisionTreeClassifier(criterion='entropy')
# This is the call that raises the ValueError when the numbers of rows in
# dummyX and dummyY differ.
clf=clf.fit(dummyX,dummyY)
print('clf:'+str(clf))
我收到此错误:
File "<ipython-input-20-eacaea56a8a9>", line 1, in <module>
runfile('C:/Users/USER/Desktop/test.py', wdir='C:/Users/USER/Desktop')
File "D:\tools\python\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
File "D:\tools\python\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
File "C:/Users/USER/Desktop/test.py", line 32, in <module>
clf=clf.fit(dummyX,dummyY)
File "D:\tools\python\lib\site-packages\sklearn\tree\tree.py", line 790, in fit
X_idx_sorted=X_idx_sorted)
File "D:\tools\python\lib\site-packages\sklearn\tree\tree.py", line 236, in fit
"number of samples=%d" % (len(y), n_samples))
ValueError: Number of labels=14 does not match number of samples=56
这只是因为每一行对应的字典被向 featuelist 列表中添加了 4 次(每个特征列一次),
所以样本数变成了 14 × 4 = 56,而标签数仍是 14。
featuelist.append(rowDict) 这一行不应位于第二个(内层)循环内。
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO
# Corrected script: read the CSV, build exactly one feature dict per data row,
# vectorize features and labels, then fit an entropy-based decision tree.

# One {column name: value} dict per data row, and the matching class labels.
featuelist = []
labeList = []

# The context manager closes the file as soon as parsing is finished.
with open('/home/kashif/test.csv') as myData:
    reader = csv.reader(myData)
    # The first CSV row holds the column names.
    headers = next(reader)
    print(headers)
    for row in reader:
        # The last column of every row is the class label.
        labeList.append(row[-1])
        # Columns between the first (row id) and the last (label) are features.
        rowDict = {}
        for i in range(1, len(row) - 1):
            rowDict[headers[i]] = row[i]
        # Append once per row, OUTSIDE the inner loop. Appending inside it adds
        # the same dict once per feature column, so the number of samples no
        # longer matches the number of labels.
        featuelist.append(rowDict)
print(featuelist)

# Turn the feature dicts into a dense numeric array (sparse=False).
vec = DictVectorizer(sparse=False)
dummyX = vec.fit_transform(featuelist)
print('dummyX:' + str(dummyX))
print(vec.get_feature_names())
print('labeList:' + str(labeList))

# Encode the string class labels numerically.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labeList)
print('dummyY:' + str(dummyY))

# Fit a decision tree that splits on the entropy criterion.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print('clf:' + str(clf))
输出:
clf:DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best')