Cross-validation and parameter tuning with XGBoost and hyperopt



One way to perform nested cross-validation with an XGB model is:

from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBClassifier
# Let's assume that we have some data for a binary classification
# problem : X (n_samples, n_features) and y (n_samples,)...
gs = GridSearchCV(estimator=XGBClassifier(),
                  param_grid={'max_depth': [3, 6, 9],
                              'learning_rate': [0.001, 0.01, 0.05]},
                  cv=2)
scores = cross_val_score(gs, X, y, cv=2)

However, regarding the tuning of the XGB parameters, several tutorials (such as this one) make use of the Python hyperopt library. I would like to be able to do nested cross-validation (as above) while tuning the XGB parameters with hyperopt.

To do so, I wrote my own Scikit-Learn estimator:

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.exceptions import NotFittedError
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

def optimize_params(X, y, params_space, validation_split=0.2):
    """Estimate a set of 'best' model parameters."""
    # Split X, y into train/validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=validation_split, stratify=y)

    # Estimate XGB params
    def objective(_params):
        _clf = XGBClassifier(n_estimators=10000,
                             max_depth=int(_params['max_depth']),
                             learning_rate=_params['learning_rate'],
                             min_child_weight=_params['min_child_weight'],
                             subsample=_params['subsample'],
                             colsample_bytree=_params['colsample_bytree'],
                             gamma=_params['gamma'])
        _clf.fit(X_train, y_train,
                 eval_set=[(X_train, y_train), (X_val, y_val)],
                 eval_metric='auc',
                 early_stopping_rounds=30)
        y_pred_proba = _clf.predict_proba(X_val)[:, 1]
        roc_auc = roc_auc_score(y_true=y_val, y_score=y_pred_proba)
        return {'loss': 1. - roc_auc, 'status': STATUS_OK}

    trials = Trials()
    return fmin(fn=objective,
                space=params_space,
                algo=tpe.suggest,
                max_evals=100,
                trials=trials,
                verbose=0)

class OptimizedXGB(BaseEstimator, ClassifierMixin):
    """XGB with optimized parameters.

    Parameters
    ----------
    custom_params_space : dict or None
        If not None, dictionary whose keys are the XGB parameters to be
        optimized and corresponding values are 'a priori' probability
        distributions for the given parameter value. If None, a default
        parameters space is used.
    """
    def __init__(self, custom_params_space=None):
        self.custom_params_space = custom_params_space

    def fit(self, X, y, validation_split=0.3):
        """Train an XGB model.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Data.
        y : ndarray, shape (n_samples,) or (n_samples, n_labels)
            Labels.
        validation_split : float (default: 0.3)
            Float between 0 and 1. Corresponds to the percentage of samples
            in X which will be used as validation data to estimate the
            'best' model parameters.
        """
        # If no custom parameters space is given, use a default one.
        if self.custom_params_space is None:
            _space = {
                'learning_rate': hp.uniform('learning_rate', 0.0001, 0.05),
                'max_depth': hp.quniform('max_depth', 8, 15, 1),
                'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1),
                'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
                'gamma': hp.quniform('gamma', 0.9, 1, 0.05),
                'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 0.7, 0.05)
            }
        else:
            _space = self.custom_params_space

        # Estimate best params using X, y
        opt = optimize_params(X, y, _space, validation_split)

        # Instantiate `xgboost.XGBClassifier` with optimized parameters
        best = XGBClassifier(n_estimators=10000,
                             max_depth=int(opt['max_depth']),
                             learning_rate=opt['learning_rate'],
                             min_child_weight=opt['min_child_weight'],
                             subsample=opt['subsample'],
                             gamma=opt['gamma'],
                             colsample_bytree=opt['colsample_bytree'])
        best.fit(X, y)
        self.best_estimator_ = best
        return self

    def predict(self, X):
        """Predict labels with trained XGB model.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)

        Returns
        -------
        output : ndarray, shape (n_samples,) or (n_samples, n_labels)
        """
        if not hasattr(self, 'best_estimator_'):
            raise NotFittedError('Call `fit` before `predict`.')
        else:
            return self.best_estimator_.predict(X)

    def predict_proba(self, X):
        """Predict label probabilities with trained XGB model.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)

        Returns
        -------
        output : ndarray, shape (n_samples, n_classes)
        """
        if not hasattr(self, 'best_estimator_'):
            raise NotFittedError('Call `fit` before `predict_proba`.')
        else:
            return self.best_estimator_.predict_proba(X)

My questions are:

  • Is this a valid approach? For instance, in the fit method of my OptimizedXGB, best.fit(X, y) will train an XGB model on X, y. However, this might lead to overfitting since no eval_set is specified to ensure early stopping (one possible workaround is sketched right after this list).
  • On a toy example (the famous iris dataset), this OptimizedXGB performs worse than a basic LogisticRegression classifier. Why is that? Is it because the example is too simplistic? See the code of the example below.
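
For reference, one way I could keep early stopping in that final refit would be to hold back a small evaluation split inside fit. This is only a sketch (the 0.1 split size is arbitrary), not what the class above currently does:

from sklearn.model_selection import train_test_split

# Hypothetical variant of the final refit in OptimizedXGB.fit: hold back a
# small slice of (X, y) as an eval_set so that early stopping still caps the
# effective number of boosting rounds.
X_fit, X_es, y_fit, y_es = train_test_split(X, y, test_size=0.1, stratify=y)
best.fit(X_fit, y_fit,
         eval_set=[(X_es, y_es)],
         eval_metric='auc',
         early_stopping_rounds=30)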

Example

import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
X, y = load_iris(return_X_y=True)
X = X[:, :2]
X = X[y < 2]
y = y[y < 2]
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)  # shuffle=True so that random_state takes effect
# With a LogisticRegression classifier
pipe = Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())])
gs = GridSearchCV(estimator=pipe, param_grid={'lr__C': [1., 10.]})
lr_scores = cross_val_score(gs, X, y, cv=skf)
# With OptimizedXGB
xgb_scores = cross_val_score(OptimizedXGB(), X, y, cv=skf)
# Print results
print('Accuracy with LogisticRegression = %.4f (+/- %.4f)' % (np.mean(lr_scores), np.std(lr_scores)))
print('Accuracy with OptimizedXGB = %.4f (+/- %.4f)' % (np.mean(xgb_scores), np.std(xgb_scores)))

Output:

Accuracy with LogisticRegression = 0.9900 (+/- 0.0100)
Accuracy with OptimizedXGB = 0.9100 (+/- 0.0300)

While the scores are close, I would have expected the XGB model to score at least as well as the LogisticRegression classifier.

EDIT:

  • Similar post

First of all, check out this post on nested CV - it might help.

Regarding your questions:

  1. Yes, this is the right way to go. Once the hyperparameters are selected, you should fit your model (the selected model) on the entire training data. However, since this model includes a model-selection process inside it, you can only "score" how well it generalizes with an outer CV, as you did.
  2. Since you are scoring the selection process as well (and not just the model, e.g. XGB vs. linear regression), there might be a problem with the selection process itself. Maybe your hyperparameter space is not defined correctly and you are selecting poor parameters? A sketch of an alternative search space follows this list.
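For instance, on a tiny two-feature problem like the truncated iris example, the default space used above (max_depth between 8 and 15, gamma close to 1) is quite aggressive. A more conservative space could be passed through custom_params_space; the ranges below are only a sketch, not tuned values:

from hyperopt import hp
from sklearn.model_selection import cross_val_score

# Hypothetical, more conservative search space for a small, low-dimensional
# dataset: shallower trees and a log-uniform learning rate.
small_data_space = {
    'learning_rate': hp.loguniform('learning_rate', -5, -1),  # roughly 0.007 to 0.37
    'max_depth': hp.quniform('max_depth', 2, 6, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1),
    'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05)
}

# Reuses X, y and skf from the example above.
xgb_scores = cross_val_score(OptimizedXGB(custom_params_space=small_data_space), X, y, cv=skf)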
