如何修复此值错误:标签数=14与样本数=56不匹配



Data:

rid age     income  student credit_rating   class_buy_computer
1   young   high    no  fair    no
2   young   high    no  excellent   no
3   middle  high    no  fair    yes
4   senior  medium  no  fair    yes
5   senior  low yes fair    yes
6   senior  low yes excellent   no
7   middle  low yes excellent   yes
8   young   medium  no  fair    yes
9   young   low yes fair    yes
10  senior  medium  yes fair    yes
11  young   medium  yes excellent   yes
12  middle  medium  no  excellent   yes
13  middle  high    yes fair    yes
14  senior  medium  no  excellent   no

代码:

from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO
# Script from the question: reads the CSV, builds feature dicts and labels,
# vectorizes both and fits a decision tree. Kept exactly as posted, since it
# is the code that produces the ValueError shown in the traceback.
# NOTE(review): the path literal looks like it lost its backslash separators
# when the page was extracted -- confirm against the original post.
myData = open(r'C:UsersUSERDesktoptest.csv')
reader = csv.reader(myData)
# The first row of the file holds the column names.
headers=next(reader)
print (headers)
featuelist=[]
labeList=[]
for row in reader:
    # The last column of every row is used as the class label.
    labeList.append(row[len(row)-1])
    rowDict={}
    # Columns 1 .. len(row)-2 are used as features; column 0 and the last
    # column are skipped.
    for i in range(1,len(row)-1):
        rowDict[headers[i]]=row[i]
        # NOTE(review): this append is inside the inner loop, so the same
        # rowDict is appended once per feature column rather than once per row.
        featuelist.append(rowDict)
print(featuelist)
# Turn the list of feature dicts into a numeric array (dense, via toarray()).
vec=DictVectorizer()
dummyX=vec.fit_transform(featuelist).toarray()
print('dummyX:'+str(dummyX))
print(vec.get_feature_names())
print('labeList:'+str(labeList))
# Encode the string labels with LabelBinarizer.
lb=preprocessing.LabelBinarizer()
dummyY=lb.fit_transform(labeList)
print('dummyY:'+str(dummyY))
# Fit a decision tree that uses entropy as the split criterion.
clf=tree.DecisionTreeClassifier(criterion='entropy')
clf=clf.fit(dummyX,dummyY)
print('clf:'+str(clf))

我收到此错误:

  File "<ipython-input-20-eacaea56a8a9>", line 1, in <module>
    runfile('C:/Users/USER/Desktop/test.py', wdir='C:/Users/USER/Desktop')
  File "D:\tools\python\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
  File "D:\tools\python\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
  File "C:/Users/USER/Desktop/test.py", line 32, in <module>
    clf=clf.fit(dummyX,dummyY)
  File "D:\tools\python\lib\site-packages\sklearn\tree\tree.py", line 790, in fit
    X_idx_sorted=X_idx_sorted)
  File "D:\tools\python\lib\site-packages\sklearn\tree\tree.py", line 236, in fit
    "number of samples=%d" % (len(y), n_samples))
ValueError: Number of labels=14 does not match number of samples=56

这只是因为每一行对应的字典都被向 featuelist 列表中添加了 4 次(每个特征列添加一次,14 行 × 4 = 56 个样本)。featuelist.append(rowDict) 这一行代码不应位于第二个(内层)循环内。

from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO
# Corrected script: build exactly one feature dict per CSV row so that the
# number of samples matches the number of labels, then fit a decision tree.
featuelist=[]
labeList=[]
# The with-block closes the file even if reading or parsing raises.
with open('/home/kashif/test.csv') as myData:
    reader = csv.reader(myData)
    # The first row of the file holds the column names.
    headers=next(reader)
    print (headers)
    for row in reader:
        # csv.reader yields an empty list for a blank line (e.g. a trailing
        # newline at the end of the file); skip it instead of crashing on
        # row[-1].
        if not row:
            continue
        # The last column is the class label.
        labeList.append(row[-1])
        rowDict={}
        # Column 0 (the row id) and the last column (the label) are not
        # features; everything in between is.
        for i in range(1,len(row)-1):
            rowDict[headers[i]]=row[i]
        #Make sure the below line is not inside the second loop
        featuelist.append(rowDict)  #<--This was the typo.
print(featuelist)
# sparse=False makes fit_transform return a dense array directly, so no
# separate .toarray() call is needed.
vec=DictVectorizer(sparse=False)
dummyX=vec.fit_transform(featuelist)
print('dummyX:'+str(dummyX))
# NOTE: newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); the old name is kept here to match the
# scikit-learn version this script's imports target.
print(vec.get_feature_names())
print('labeList:'+str(labeList))
# Encode the string labels with LabelBinarizer.
lb=preprocessing.LabelBinarizer()
dummyY=lb.fit_transform(labeList)
print('dummyY:'+str(dummyY))
# Fit a decision tree that uses entropy as the split criterion.
clf=tree.DecisionTreeClassifier(criterion='entropy')
clf=clf.fit(dummyX,dummyY)
print('clf:'+str(clf))

输出:

clf:DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

相关内容

  • 没有找到相关文章