在 Python 的 sklearn.ensemble 库中,我想使用某种 boosting 方法(比如 AdaBoost)来训练我的数据。由于我想知道估计器的最佳数量,我计划每次用不同数量的估计器做一次交叉验证(cv)。然而,按下面这种方式操作似乎是多余的:
# Cross-validate AdaBoost on decision stumps once per candidate ensemble size.
for n in [50, 100, 150, 200, 250, 300]:
    model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=n)
    # The number of folds is passed through ``cv``; cross_val_score has no
    # ``k`` parameter.
    cross_val_score(model, x, y, cv=5)
因为在 AdaBoost 中,一旦我用 50 个估计器训练好了分类器,当我接着用 100 个估计器训练时,前 50 个分类器及其权重并不会改变。我想知道在这种情况下,是否有办法直接从第 51 个弱学习器开始训练。
可以通过继承对 AdaBoostClassifier 进行一次"破解"(hack):破解后的版本不会重新训练已有的估计器,并且与 sklearn 中的许多交叉验证函数兼容(必须是不打乱数据的交叉验证)。
如果你查看 sklearn.ensemble.weight_boosting.py 中的源代码,就会发现只要正确地包装 AdaBoostClassifier.fit() 和 AdaBoostClassifier._boost() 的行为,就不需要重新训练估计器了。
交叉验证函数的问题在于,它们使用 sklearn.base.clone() 克隆原始估计器,而 sklearn.base.clone() 会对估计器的参数进行深拷贝。深拷贝的性质使得估计器无法在不同的交叉验证运行之间"记住"其已训练的估计器(clone() 复制的是引用所指向的内容,而不是引用本身)。做到这一点的唯一方法(至少是我能想到的唯一方法)是使用全局状态来跟踪各次运行之间的旧估计器。这里的问题是,你必须计算 X 特征的哈希值,这可能开销很大!
无论如何,以下是对AdaBoostClassifier
本身的破解:
'''
adaboost_hack.py
Make a "hack" of AdaBoostClassifier in sklearn.ensemble.weight_boosting.py
that doesn't need to retrain estimators and is compatible with many sklearn
cross validation functions.
'''
import copy
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.base import clone
# Module-level cache that carries training state from one cross validation
# run to the next.
#
# sklearn cross validation functions copy the estimator they are given with
# sklearn.base.clone(), and clone() deep-copies the estimator's parameters,
# so nothing stored on an instance survives between runs. A global variable
# is the only place where earlier estimators can be remembered.
#
# Each cv fold is identified by the hash of its slice of X[:, 0]. Cross
# validators that randomly shuffle the data before splitting can NOT be
# used: every run would then produce different hashes.
kfold_hash = dict()
class WarmRestartAdaBoostClassifier(AdaBoostClassifier):
    '''
    AdaBoostClassifier that reuses already-trained estimators between fits.

    Keeps track of old estimators, estimator weights, the estimator errors,
    and the next to last sample weight seen, so that changing n_estimators
    and calling fit() again either drops surplus estimators or trains only
    the missing ones.

    Note that AdaBoostClassifier._boost() does NOT boost the last seen sample
    weight. The simple fix is to drop the last estimator and retrain it from
    the sample weight that was fed to it.

    fit() is wrapped to decide whether to throw away estimators or add
    estimators depending on the current number of estimators vs the number
    of old estimators. When use_kfold_hash is True the old values are read
    from, and written back to, the global kfold_hash.

    _boost() is wrapped to record the next to last sample weight.

    Parameters
    ----------
    base_estimator, n_estimators, learning_rate, algorithm, random_state :
        Forwarded unchanged to AdaBoostClassifier.__init__().
    next_to_last_sample_weight : array-like or None
        Sample weight to restart boosting from when estimators are added.
    old_estimators_ : list or None
        Estimators kept from earlier fits. None or an empty list means that
        nothing has been trained yet.
    use_kfold_hash : bool
        If True, look up and store the training state in the global
        kfold_hash, keyed by a hash of X[:, 0].
    '''

    def __init__(self,
                 base_estimator=None,
                 n_estimators=50,
                 learning_rate=1.,
                 algorithm='SAMME.R',
                 random_state=None,
                 next_to_last_sample_weight=None,
                 old_estimators_=None,
                 use_kfold_hash=False):
        # Forward by keyword so the call does not depend on the positional
        # order of the parent signature (presumably keyword-only in newer
        # scikit-learn releases -- verify against the installed version).
        AdaBoostClassifier.__init__(self,
                                    base_estimator=base_estimator,
                                    n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    algorithm=algorithm,
                                    random_state=random_state)
        self.next_to_last_sample_weight = next_to_last_sample_weight
        self._last_sample_weight = None
        # Stored exactly as passed; None replaces the former shared mutable
        # default and is treated like an empty list everywhere below.
        self.old_estimators_ = old_estimators_
        self.use_kfold_hash = use_kfold_hash
        # Full-length weights/errors matching old_estimators_. They are kept
        # apart from estimator_weights_/estimator_errors_, which fit() may
        # truncate when n_estimators is lowered.
        self._old_estimator_weights = None
        self._old_estimator_errors = None

    def _boost(self, iboost, X, y, sample_weight, random_state):
        '''
        Run one boosting step and record the sample weight.

        Parameters and return behavior same as that of
        AdaBoostClassifier._boost() as seen in
        sklearn.ensemble.weight_boosting.py

        Parameters
        ----------
        iboost : int
            The index of the current boost iteration.
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrix can be CSC, CSR, COO,
            DOK, or LIL. COO, DOK, and LIL are converted to CSR.
        y : array-like of shape = [n_samples]
            The target values (class labels).
        sample_weight : array-like of shape = [n_samples]
            The current sample weights.
        random_state : RandomState
            The current random number generator

        Returns
        -------
        sample_weight : array-like of shape = [n_samples] or None
            The reweighted sample weights.
            If None then boosting has terminated early.
        estimator_weight : float
            The weight for the current boost.
            If None then boosting has terminated early.
        error : float
            The classification error for the current boost.
            If None then boosting has terminated early.
        '''
        fit_info = AdaBoostClassifier._boost(self, iboost, X, y, sample_weight, random_state)
        sample_weight, _, _ = fit_info
        # Shift the window: what was "last" before this step becomes
        # "next to last", and this step's output becomes "last".
        self.next_to_last_sample_weight = self._last_sample_weight
        self._last_sample_weight = sample_weight
        return fit_info

    def _snapshot_history(self):
        '''Remember the current full-length estimator weights and errors.'''
        self._old_estimator_weights = self.estimator_weights_
        self._old_estimator_errors = self.estimator_errors_

    def _full_history(self):
        '''Return (weights, errors) that line up with old_estimators_.'''
        # Fall back to the public arrays for instances whose state was set
        # up without going through fit() on this class.
        if getattr(self, '_old_estimator_weights', None) is None:
            self._snapshot_history()
        return self._old_estimator_weights, self._old_estimator_errors

    def fit(self, X, y):
        '''
        Fit the ensemble, reusing previously trained estimators if possible.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The training input samples. Must support X[:, 0] indexing when
            use_kfold_hash is True.
        y : array-like of shape = [n_samples]
            The target values (class labels).

        Returns
        -------
        self : object
        '''
        hash_X = None
        if self.use_kfold_hash:
            # Use a hash of X features in this kfold to access the global
            # information for this kfold. Only the first column is hashed.
            hash_X = hash(bytes(X[:, 0]))
            cached = kfold_hash.get(hash_X)
            if cached is not None:
                self.old_estimators_ = cached['old_estimators_']
                self.next_to_last_sample_weight = cached['next_to_last_sample_weight']
                self._old_estimator_weights = cached['estimator_weights_']
                self._old_estimator_errors = cached['estimator_errors_']

        if not self.old_estimators_:
            # We haven't done any fits yet.
            AdaBoostClassifier.fit(self, X, y)
            self.old_estimators_ = list(self.estimators_)
            self._snapshot_history()
        elif self.n_estimators <= len(self.old_estimators_):
            # Enough estimators already exist: expose a prefix of them.
            # Slicing the full-length history (not the possibly already
            # truncated public arrays) keeps weights and estimators the same
            # length after repeated shrinking.
            # NOTE(review): a freshly cloned instance that lands here has not
            # run AdaBoostClassifier.fit(), so attributes set only by the
            # parent fit may be missing -- confirm before relying on it.
            full_weights, full_errors = self._full_history()
            self.estimators_ = self.old_estimators_[:self.n_estimators]
            self.estimator_weights_ = full_weights[:self.n_estimators]
            self.estimator_errors_ = full_errors[:self.n_estimators]
        else:
            # The case that we add new estimators.
            n_more = self.n_estimators - len(self.old_estimators_)
            self.fit_more(X, y, n_more)

        # Record information in the global hash if necessary. The
        # full-length history is stored so later runs can shrink or grow.
        if self.use_kfold_hash:
            full_weights, full_errors = self._full_history()
            kfold_hash[hash_X] = {'old_estimators_' : self.old_estimators_,
                                  'next_to_last_sample_weight' : self.next_to_last_sample_weight,
                                  'estimator_weights_' : full_weights,
                                  'estimator_errors_' : full_errors}
        return self

    def fit_more(self, X, y, n_more):
        '''
        Fits additional estimators.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The training input samples.
        y : array-like of shape = [n_samples]
            The target values (class labels).
        n_more : int
            Number of estimators to add to the ones already trained.

        Raises
        ------
        RuntimeError
            If no estimators have been fit yet.
        '''
        # Check the precondition before touching any state.
        if not self.old_estimators_:
            raise RuntimeError('Should have already fit estimators before calling fit_more()')

        # Since AdaBoostClassifier._boost() doesn't boost the last sample
        # weight, we retrain the last estimator with its input sample weight:
        # keep everything but the last estimator and train n_more + 1 new ones.
        full_weights, full_errors = self._full_history()
        kept_estimators = list(self.old_estimators_[:-1])
        kept_weights = full_weights[:-1]
        kept_errors = full_errors[:-1]

        requested = self.n_estimators
        self.n_estimators = n_more + 1
        try:
            AdaBoostClassifier.fit(self, X, y,
                                   sample_weight=self.next_to_last_sample_weight)
        except Exception:
            # Leave n_estimators and old_estimators_ as the caller set them.
            self.n_estimators = requested
            raise

        kept_estimators.extend(self.estimators_)
        self.old_estimators_ = kept_estimators
        self.estimators_ = list(kept_estimators)
        self.n_estimators = len(self.estimators_)
        self.estimator_weights_ = np.concatenate([kept_weights, self.estimator_weights_])
        self.estimator_errors_ = np.concatenate([kept_errors, self.estimator_errors_])
        self._snapshot_history()
下面是一个示例,可以让你将破解版与原始 AdaBoostClassifier 的耗时/准确率进行比较。注意,随着我们添加估计器,对破解版进行测试(预测)所需的时间会增加,但训练所需的时间不会。我发现破解版比原始版本运行得快得多,不过我并没有对大量的 X 样本进行哈希处理。
'''
example.py
Test the AdaBoost hack.
'''
import time # Used to get timing info.
import adaboost_hack
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier # We will use stumps for our classifiers.
from sklearn.ensemble import AdaBoostClassifier # Used to compare hack to original.
from sklearn.model_selection import (cross_val_score, KFold)
from sklearn.metrics import accuracy_score
my_random = np.random.RandomState(0)  # For consistent results.
nSamples = 2000

# Make some sample data: 2-D points drawn uniformly from the unit square.
X = my_random.uniform(size=(nSamples, 2))

# Decision boundary is the unit circle.
in_class = X[:, 0]**2 + X[:, 1]**2 > 1
# Labels are allocated once; a second identical zero array used to be
# created before the boundary was computed and was immediately discarded.
y = np.zeros(len(X), dtype=int)
y[in_class] = 1

# Add some random error by flipping a fixed fraction of the labels.
error_rate = 0.01
to_flip = my_random.choice(np.arange(len(y)), size=int(error_rate * len(y)), replace=False)
y[to_flip] = 1 - y[to_flip]

# Plot the data.
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.title('Simulated Data')
plt.show()

# Make our hack solution. Initially do 2 estimators (n_estimators starts at 1
# and is incremented before the first fit).
# Train the hack without testing. Should find nearly constant time per training session.
print('Training hack without testing.')
ada_boost_hack = adaboost_hack.WarmRestartAdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1, random_state=my_random),
    n_estimators=1,
    random_state=my_random)
nFit = 50
times = []
for i in range(nFit):
    # Timestamp taken before each fit; consecutive gaps give time per fit.
    times.append(time.time())
    ada_boost_hack.n_estimators += 1
    ada_boost_hack.fit(X, y)
def get_differences(times):
    '''Return the gaps between consecutive entries of times as an array.'''
    stamps = np.asarray(times)
    later, earlier = stamps[1:], stamps[:-1]
    return np.subtract(later, earlier)
# Timings of the run without testing, keyed by a label for the final plot.
times_per_train = {}
times_per_train['hack no test'] = get_differences(times)

# Now look at running tests while training the hack. Should have small linear
# growth in time per training session.
print('Training hack with testing.')
stump = DecisionTreeClassifier(max_depth=1, random_state=my_random)
ada_boost_hack = adaboost_hack.WarmRestartAdaBoostClassifier(stump,
                                                             n_estimators=1,
                                                             random_state=my_random)
times = []
scores = []
for _ in range(nFit):
    times.append(time.time())
    ada_boost_hack.n_estimators += 1
    ada_boost_hack.fit(X, y)
    # Score on the training data itself after every added estimator.
    scores.append(accuracy_score(y, ada_boost_hack.predict(X)))

plt.plot(scores)
plt.title('Training scores for hack')
plt.ylabel('Accuracy')
plt.show()
times_per_train['hack with test'] = get_differences(times)

print('Now training hack with cross validation')
stump = DecisionTreeClassifier(max_depth=1, random_state=my_random)
ada_boost_hack = adaboost_hack.WarmRestartAdaBoostClassifier(stump,
                                                             n_estimators=1,
                                                             random_state=my_random,
                                                             use_kfold_hash=True)

# Now try cross_val_score().
scores = []
times = []
# KFold keeps the folds, and therefore the hashes of the X features of each
# fold, the same between each run.
for n_wanted in range(1, nFit + 1):
    ada_boost_hack.set_params(n_estimators=n_wanted)
    scores.append(cross_val_score(ada_boost_hack, X, y, cv=KFold(3)))
    times.append(time.time())
def plot_cv_scores(scores):
    '''Plot the mean of each row of scores with red lines 2 std above and below.'''
    score_table = np.array(scores)
    row_mean = score_table.mean(axis=1)
    row_spread = score_table.std(axis=1) * 2
    plt.plot(row_mean)
    for edge in (row_mean + row_spread, row_mean - row_spread):
        plt.plot(edge, color='red')
    plt.ylabel('Accuracy')
plot_cv_scores(scores)
plt.title('Cross validation scores for hack')
plt.show()
times_per_train['hack cross validation'] = get_differences(times)

# Double check that kfold_hash only has 3 keys since we used cv = 3.
# The label names the object that is actually printed.
print('adaboost_hack.kfold_hash.keys() = ', adaboost_hack.kfold_hash.keys())

# Now get timings for original classifier.
print('Now doing cross validations of original')
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1,
                                                      random_state=np.random.RandomState(0)),
                               n_estimators=1,
                               random_state=np.random.RandomState(0))
times = []
scores = []
# Same KFold(3) splitter as the hack run above; the plain AdaBoostClassifier
# retrains from scratch for every value of n_estimators.
for i in range(1, nFit + 1):
    ada_boost.set_params(n_estimators=i)
    new_scores = cross_val_score(ada_boost, X, y, cv=KFold(3))
    scores.append(new_scores)
    times.append(time.time())

plot_cv_scores(scores)
plt.title('Cross validation scores for original')
plt.show()
times_per_train['original cross validation'] = get_differences(times)

# Plot all of the timing data, one labelled line per experiment.
for key, deltas in times_per_train.items():
    plt.plot(deltas, label=key)
plt.title('Time per training or cv score')
plt.ylabel('Time')
plt.xlabel('nth training or cv score')
plt.legend()
plt.show()
您可以一次性拟合全部 300 个估计器,然后使用 AdaBoostClassifier.staged_predict() 来跟踪错误率如何随估计器数量变化。但是,您必须自己进行交叉验证的数据拆分;我认为它与 cross_val_score() 不兼容。
例如,
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier # We will use simple stumps for individual estimators in AdaBoost.
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)
nSamples = {'train' : 2000, 'test' : 1000}
n_train = nSamples['train']
X = np.random.uniform(size=(n_train + nSamples['test'], 2))

# Decision boundary is the unit circle.
in_class = X[:, 0]**2 + X[:, 1]**2 > 1
y = np.zeros(len(X), dtype=int)
y[in_class] = 1

# Add some random error.
error_rate = 0.01
to_flip = np.random.choice(np.arange(len(y)), size=int(error_rate * len(y)), replace=False)
y[to_flip] = 1 - y[to_flip]

# Split training and test: the first n_train rows train, the rest test.
X = {'train' : X[:n_train], 'test' : X[n_train:]}
y = {'train' : y[:n_train], 'test' : y[n_train:]}

# Make AdaBoost Classifier on top of a single-split tree (just a stump).
max_estimators = 50
stump = DecisionTreeClassifier(max_depth=1, random_state=np.random.RandomState(0))
ada_boost = AdaBoostClassifier(stump,
                               n_estimators=max_estimators,
                               random_state=np.random.RandomState(0))

# Fit all estimators.
ada_boost.fit(X['train'], y['train'])

# Get the accuracy for each stage of prediction, on both splits.
scores = {'train' : [], 'test' : []}
for split in ('train', 'test'):
    for staged_prediction in ada_boost.staged_predict(X[split]):
        scores[split].append(accuracy_score(y[split], staged_prediction))

# Plot the results against the number of estimators used at each stage.
n_estimators = range(1, len(scores['train']) + 1)
for split_scores in scores.values():
    plt.plot(n_estimators, split_scores)
plt.title('Staged Scores')
plt.ylabel('Accuracy')
plt.xlabel('N Estimators')
plt.legend(scores.keys())
plt.show()