我使用下面的方法来获取估计器的各项评分。
有没有更快的方法来得到这些交叉验证分数?
# Report the mean and standard deviation of each metric over 5-fold
# cross-validation. Each entry pairs a scoring name with its output template.
# Note that every metric triggers its own complete cross-validation run.
reports = (
    ('accuracy', "Accuracy : %0.3f (+/- %0.2f) [%s]"),
    ('precision', "Precision: %0.3f (+/- %0.2f) [%s] "),
    ('recall', "Recall : %0.3f (+/- %0.2f) [%s] \n"),
)
for clfx, label in zip([clf0], ['Random Forest']):
    for scoring, template in reports:
        scores = cross_validation.cross_val_score(clfx, X, y, cv=5, scoring=scoring)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(template % (scores.mean(), scores.std(), label))
输出:
Accuracy : 0.82 (+/- 0.00) [Random Forest]
Precision: 0.50 (+/- 0.02) [Random Forest]
Recall : 0.13 (+/- 0.01) [Random Forest]
这样做是不是太繁琐了?我是否应该改用单次运行得到的混淆矩阵?
不幸的是,如果您想合并指标,我认为您必须"手动"运行交叉验证迭代:
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_score, accuracy_score, recall_score
# Run the cross-validation folds by hand so that one fit per fold yields
# all three metrics at once. Each list collects one score per fold.
all_scores = {'precision': [], 'recall': [], 'accuracy': []}
for train, test in KFold(n=len(X)):
    clfx.fit(X[train, :], y[train])
    y_pred = clfx.predict(X[test])
    y_true = y[test]
    # scikit-learn metrics take (y_true, y_pred); swapping the arguments
    # silently exchanges precision and recall.
    # The scores are scalars, so they must be appended: `list += float`
    # raises TypeError.
    all_scores['precision'].append(precision_score(y_true, y_pred))
    all_scores['accuracy'].append(accuracy_score(y_true, y_pred))
    all_scores['recall'].append(recall_score(y_true, y_pred))

# Summarise each metric as mean (+/- standard deviation) across the folds.
scores = all_scores['accuracy']
print("Accuracy : %0.3f (+/- %0.2f) [%s]" % (np.mean(scores), np.std(scores), label))
scores = all_scores['precision']
print("Precision: %0.3f (+/- %0.2f) [%s] " % (np.mean(scores), np.std(scores), label))
scores = all_scores['recall']
print("Recall : %0.3f (+/- %0.2f) [%s] \n" % (np.mean(scores), np.std(scores), label))
如果你愿意,也可以使用 multiprocessing 模块
将其并行化(能够并行运行正是使用 scikit-learn 交叉验证函数的主要优点之一):
from multiprocessing import Pool
def score(cv_split, clfx=clfx, X=X, y=y):
    """Fit the classifier on one CV split and score it on the held-out part.

    Args:
        cv_split: A ``(train, test)`` pair of index arrays used to index
            ``X`` and ``y``.
        clfx: Estimator exposing ``fit`` and ``predict``; it is fitted in
            place. The default is bound when the function is defined.
        X: Feature matrix, indexed by row with the split indices.
        y: Target vector, indexed with the split indices.

    Returns:
        A dict with the keys ``'precision'``, ``'accuracy'`` and
        ``'recall'``, each mapped to the score for this split.
    """
    train, test = cv_split
    clfx.fit(X[train, :], y[train])
    y_pred = clfx.predict(X[test])
    y_true = y[test]
    # scikit-learn metrics take (y_true, y_pred); swapping the arguments
    # silently exchanges precision and recall.
    all_scores = {}
    all_scores['precision'] = precision_score(y_true, y_pred)
    all_scores['accuracy'] = accuracy_score(y_true, y_pred)
    all_scores['recall'] = recall_score(y_true, y_pred)
    return all_scores
# Score every cross-validation split in a pool of six worker processes.
p = Pool(6)
try:
    scores_by_run = p.map(score, KFold(len(X)))
finally:
    # Always release the worker processes, even if scoring a split raises.
    p.close()
    p.join()
# Regroup the per-split dicts into one list of scores per metric name.
all_scores = {k: [d[k] for d in scores_by_run] for k in scores_by_run[0]}