How to perform a grid search over multiple ML models



Usually we use GridSearchCV to run a grid search over the hyperparameters of one specific model, for example:

model_ada = AdaBoostClassifier()
params_ada = {'n_estimators':[10,20,30,50,100,500,1000], 'learning_rate':[0.5,1,2,5,10]}
grid_ada = GridSearchCV(estimator = model_ada, param_grid = params_ada, scoring = 'accuracy', cv = 5, verbose = 1, n_jobs = -1)
grid_ada.fit(X_train, y_train)

Is there any technique or function that lets us run a grid search over the ML models themselves? For example, I would like to do something like this:

models = {'model_gbm':GradientBoostingClassifier(), 'model_rf':RandomForestClassifier(), 'model_dt':DecisionTreeClassifier(), 'model_svm':SVC(), 'model_ada':AdaBoostClassifier()}
params_gbm = {'learning_rate':[0.1,0.2,0.3,0.4], 'n_estimators':[50,100,500,1000,2000]}
params_rf = {'n_estimators':[50,100,500,1000,2000]}
params_dt = {'splitter':['best','random'], 'max_depth':[1, 5, 10, 50, 100]}
params_svm = {'C':[1,2,5,10,50,100,500], 'kernel':['rbf','poly','sigmoid','linear']}
params_ada = {'n_estimators':[10,20,30,50,100,500,1000], 'learning_rate':[0.5,1,2,5,10]}
params = {'params_gbm':params_gbm, 'params_rf':params_rf, 'params_dt':params_dt, 'params_svm':params_svm, 'params_ada':params_ada}
grid_ml = "that function"(models = models, params = params)
grid_ml.fit(X_train, y_train)

其中";该函数";是我需要用来执行这种操作的函数。

I faced a similar problem myself, but could not find a predefined package/method for this, so I wrote my own function to do it:

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

def Algo_search(models, params):
    # Assumes X_train, y_train, X_test, y_test are already defined in the calling scope.
    max_score = 0
    max_model = None
    max_model_params = None
    for name, model in models.items():
        gs = GridSearchCV(estimator=model, param_grid=params[name])
        gs.fit(X_train, y_train)
        score = gs.score(X_test, y_test)  # test-set score of the refit best estimator
        if score > max_score:
            max_score = score
            max_model = gs.best_estimator_
            max_model_params = gs.best_params_
    return max_score, max_model, max_model_params  # best model across all grids
#Data points
models = {'model_gbm':GradientBoostingClassifier(), 'model_rf':RandomForestClassifier(), 
'model_dt':DecisionTreeClassifier(), 'model_svm':SVC(), 'model_ada':AdaBoostClassifier()}
params_gbm = {'learning_rate':[0.1,0.2,0.3,0.4], 'n_estimators':[50,100,500,1000,2000]}
params_rf = {'n_estimators':[50,100,500,1000,2000]}
params_dt = {'splitter':['best','random'], 'max_depth':[1, 5, 10, 50, 100]}
params_svm = {'C':[1,2,5,10,50,100,500], 'kernel':['rbf','poly','sigmoid','linear']}
params_ada = {'n_estimators':[10,20,30,50,100,500,1000], 'learning_rate':[0.5,1,2,5,10]}
params = {'model_gbm':params_gbm, 'model_rf':params_rf, 'model_dt':params_dt, 'model_svm':params_svm, 'model_ada':params_ada}
grid_ml = Algo_search(models = models, params = params)
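Since Algo_search returns a plain tuple, the call above can also unpack it directly; a small usage sketch, assuming the train/test split already exists:

best_score, best_model, best_params = Algo_search(models=models, params=params)
print(best_score, best_model, best_params)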

Running multiple GridSearchCV instances and then comparing the results should be straightforward.

Below is a complete example of how to do this.

Note that there is still room for improvement, which I will leave to you; this is just meant to give you some ideas.

from sklearn import datasets
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

def get_param(model_name, params):
    """
    Not the most efficient way.
    I recommend keeping params and models
    in an OrderedDict() instead.
    """
    # Match e.g. 'model_gbm' to 'params_gbm' by comparing the name tokens.
    for k, v in params.items():
        mn = str(model_name).upper().split('_')
        for k_ in str(k).upper().split('_'):
            if k_ in mn:
                return v

def models_gridSearchCV(models, params, scorer, X, y):
    all_results = dict.fromkeys(models.keys(), [])
    best_model = {'model_name': None,
                  'best_estimator': None,
                  'best_params': None,
                  'best_score': -9999999}
    for model_name, model in models.items():
        print("Processing {} ...".format(model_name))
        # or use OrderedDict() and zip(models, params) above
        # so there will be no need to check
        param = get_param(model_name, params)
        if param is None:
            continue
        clf = GridSearchCV(model, param, scoring=scorer)
        clf.fit(X, y)
        all_results[model_name] = clf.cv_results_
        if clf.best_score_ > best_model.get('best_score'):
            best_model['model_name'] = model_name
            best_model['best_estimator'] = clf.best_estimator_
            best_model['best_params'] = clf.best_params_
            best_model['best_score'] = clf.best_score_
    return best_model, all_results

### TEST ###
iris = datasets.load_iris()
X, y = iris.data, iris.target
# OrderedDict() is recommended here
# to maintain order between models and params 
models = {'model_gbm': GradientBoostingClassifier(),
          'model_rf': RandomForestClassifier(),
          'model_dt': DecisionTreeClassifier(),
          'model_svm': SVC(),
          'model_ada': AdaBoostClassifier()}
params_gbm = {'learning_rate': [0.1, 0.2], 'n_estimators': [50, 100]}
params_rf = {'n_estimators': [50, 100]}
params_dt = {'splitter': ['best', 'random'], 'max_depth': [1, 5]}
params_svm = {'C': [1, 2, 5], 'kernel': ['rbf', 'linear']}
params_ada = {'n_estimators': [10, 100], 'learning_rate': [0.5, 1]}
# OrderedDict() is recommended here
# to maintain order between models and params 
params = {'params_gbm': params_gbm,
          'params_rf': params_rf,
          'params_dt': params_dt,
          'params_svm': params_svm,
          'params_ada': params_ada}
best_model, all_results = models_gridSearchCV(models, params, 'accuracy', X, y)
print(best_model)
# print(all_results)

Result

Processing model_gbm ...
Processing model_rf ...
Processing model_dt ...
Processing model_svm ...
Processing model_ada ...
{'model_name': 'model_svm', 'best_estimator': SVC(C=5), 
'best_params': {'C': 5, 'kernel': 'rbf'}, 'best_score': 0.9866666666666667}
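One of the improvements hinted at in the comments above is to build models and params as OrderedDicts in the same order, so the name matching in get_param is no longer needed. A minimal sketch of that variant, reusing the imports and X, y from the example above; models_gridSearchCV_ordered and the shortened grids are illustrative only:

from collections import OrderedDict

def models_gridSearchCV_ordered(models, params, scorer, X, y):
    # models and params must be built in the same order, e.g. as OrderedDicts.
    best_model = {'model_name': None, 'best_estimator': None,
                  'best_params': None, 'best_score': -9999999}
    all_results = {}
    for (model_name, model), param in zip(models.items(), params.values()):
        clf = GridSearchCV(model, param, scoring=scorer)
        clf.fit(X, y)
        all_results[model_name] = clf.cv_results_
        if clf.best_score_ > best_model['best_score']:
            best_model = {'model_name': model_name,
                          'best_estimator': clf.best_estimator_,
                          'best_params': clf.best_params_,
                          'best_score': clf.best_score_}
    return best_model, all_results

models = OrderedDict([('model_svm', SVC()), ('model_rf', RandomForestClassifier())])
params = OrderedDict([('params_svm', {'C': [1, 2, 5]}), ('params_rf', {'n_estimators': [50, 100]})])
best_model, all_results = models_gridSearchCV_ordered(models, params, 'accuracy', X, y)
print(best_model)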