ValueError：发现样本数不一致的数组 [ 6 1786]

这是我的代码：

from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import datasets
import numpy as np
# Question's original script, kept verbatim: it reproduces the ValueError
# shown in the traceback below.
newsgroups = datasets.fetch_20newsgroups(
                subset='all',
                categories=['alt.atheism', 'sci.space']
         )
X = newsgroups.data
y = newsgroups.target
TD_IF = TfidfVectorizer()
# NOTE(review): the whole `newsgroups` object is vectorized here instead of
# the documents in X, and the result is stored under a label-like name.
# Presumably the vectorizer iterates over the container's keys, which would
# explain the 6 in the reported error -- confirm.
y_scaled = TD_IF.fit_transform(newsgroups, y)
# Candidate values for the SVM regularization parameter C: 10**-5 ... 10**5.
grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(y_scaled.size, n_folds=5, shuffle=True, random_state=241) 
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)
# NOTE(review): this is the call that raises per the traceback -- the raw
# texts X are paired with the tf-idf matrix y_scaled rather than with the
# labels y, so the two arguments have different numbers of samples.
gs.fit(X, y_scaled)

我收到错误，我不明白为什么。回溯：

Traceback (most recent call last):
  File "C:/Users/Roman/PycharmProjects/week_3/assignment_2.py", line 23, in <module>
    gs.fit(X, y_scaled) #TODO: check this line
  File "C:\Users\Roman\AppData\Roaming\Python\Python35\site-packages\sklearn\grid_search.py", line 804, in fit
    return self._fit(X, y, ParameterGrid(self.param_grid))
  File "C:\Users\Roman\AppData\Roaming\Python\Python35\site-packages\sklearn\grid_search.py", line 525, in _fit
    X, y = indexable(X, y)
  File "C:\Users\Roman\AppData\Roaming\Python\Python35\site-packages\sklearn\utils\validation.py", line 201, in indexable
    check_consistent_length(*result)
  File "C:\Users\Roman\AppData\Roaming\Python\Python35\site-packages\sklearn\utils\validation.py", line 176, in check_consistent_length
    "%s" % str(uniques))
ValueError: Found arrays with inconsistent numbers of samples: [ 6 1786]

有人可以解释为什么会发生此错误吗？

我想你把这里的 X 和 y 弄混了。你需要把 X 转换为 tf-idf 向量，然后用它和标签 y 一起训练模型。见下文：

from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import datasets
import numpy as np
# Corrected script: vectorize the documents (X) and train against the labels (y).
newsgroups = datasets.fetch_20newsgroups(
                subset='all',
                categories=['alt.atheism', 'sci.space']
         )
X = newsgroups.data
y = newsgroups.target
TD_IF = TfidfVectorizer()
# Turn the documents into a tf-idf feature matrix; this is the model input.
X_scaled = TD_IF.fit_transform(X, y)
# Reduced search grid for the SVM regularization parameter C: 10**-1 and 10**0.
grid = {'C': np.power(10.0, np.arange(-1, 1))}
# The legacy KFold takes the number of samples as its first argument.
# Use y.size: `y_scaled` is not defined in this script and would raise
# NameError before the grid search ever runs.
cv = KFold(y.size, n_folds=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)
# Features and labels now have the same number of samples.
gs.fit(X_scaled, y)

相关内容

最新更新

热门标签：