shap: TreeExplainer 尚不支持的模型类型: <class 'sklearn.model_selection._search.GridSearchCV'>



有人能告诉我如何修复下面这段代码吗(一个可复现的例子):

from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest 
#from xgboost import XGBClassifier 
from sklearn.feature_selection import mutual_info_classif 
from sklearn.feature_selection import SelectKBest 
from sklearn.pipeline import Pipeline 
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score 
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import make_scorer 
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score 
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean 
from sklearn.model_selection import train_test_split
from numpy import std 
from sklearn.utils import shuffle 
import numpy as np 
from sklearn.metrics import roc_curve 
import matplotlib.pyplot as plt 
import pickle
import pandas as pd
import shap
from sklearn.feature_selection import RFECV  # used by the pipeline below

# Synthetic binary-classification data so the example is reproducible.
X_train, y_train = make_classification(
    n_samples=1000,
    n_features=20,
    random_state=1,
    n_informative=10,
    n_redundant=10,
)

# The function below slices its inputs with ``.iloc``, so wrap the NumPy
# arrays in pandas containers with explicit column names.
columns = [f"feature_{i}" for i in range(X_train.shape[1])]
full_X_train = pd.DataFrame(X_train, columns=columns)
full_y_train = pd.Series(y_train)


def run_model_with_grid_search(
    param_grid=None,
    output_plt_file='plt.png',
    model_name=RandomForestClassifier(),
    X_train=full_X_train,
    y_train=full_y_train,
    model_id='random_forest_with_hpo_no_fs_geno_class',
    n_splits=5,
    output_file='random_forest_with_hpo_no_fs_geno_class.txt',
):
    """Run nested cross-validation and print SHAP-based feature importances.

    For each of three outer folds, an ``RFECV`` + classifier pipeline is tuned
    with ``GridSearchCV`` on the training part, and SHAP values are requested
    for the held-out part. The per-fold SHAP values are then concatenated and
    summarised as a mean absolute value per column.

    Args:
        param_grid: Grid passed to ``GridSearchCV``; keys address pipeline
            steps (e.g. ``'clf_cv__min_samples_leaf'``). ``None`` means an
            empty grid.
        model_name: Estimator instance used both inside ``RFECV`` and as the
            final ``'clf_cv'`` pipeline step.
        X_train: Feature table; must support ``.iloc`` (pandas DataFrame).
        y_train: Target values; must support ``.iloc`` (pandas Series).
        output_plt_file, model_id, n_splits, output_file: Accepted but not
            used by the body; both fold counts are hard-coded to 3.

    Returns:
        None. The importance table is printed.
    """
    param_grid = {} if param_grid is None else param_grid

    cv_outer = KFold(n_splits=3, shuffle=True, random_state=1)
    list_shap_values = []
    list_test_sets = []
    for train_ix, test_ix in cv_outer.split(X_train):
        split_x_train = X_train.iloc[train_ix, :]
        split_x_test = X_train.iloc[test_ix, :]
        split_y_train = y_train.iloc[train_ix]

        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        model = model_name
        rfecv = RFECV(estimator=model, scoring='accuracy', cv=cv_inner)
        pipeline = Pipeline([('feature_sele', rfecv), ('clf_cv', model)])
        search = GridSearchCV(
            pipeline,
            param_grid=param_grid,
            scoring='accuracy',
            cv=cv_inner,
            refit=True,
        )
        result = search.fit(split_x_train, split_y_train)
        best_model = result.best_estimator_

        # NOTE: this is the line that raises
        # "Model type not yet supported by TreeExplainer": ``result`` is a
        # GridSearchCV object and ``best_model`` is a Pipeline, and neither is
        # a tree model. It is left as-is because it is the subject of the
        # question.
        explainer = shap.TreeExplainer(result)  # or when I change this to best_model, I get the same error
        shap_values = explainer.shap_values(split_x_test)
        list_shap_values.append(shap_values)
        list_test_sets.append(test_ix)

    # Stitch the folds back together: row indices along axis 0, SHAP values
    # along axis 1.
    # NOTE(review): assumes each fold's SHAP output converts to an array of
    # shape (n_classes, n_samples, n_features) with the same feature count in
    # every fold -- confirm for the shap version in use.
    test_set = np.concatenate(list_test_sets, axis=0)
    shap_values = np.concatenate(
        [np.array(fold_values) for fold_values in list_shap_values], axis=1
    )

    X_test_df = X_train.iloc[test_set]
    # Mean absolute SHAP value per column, taken from index 1 of the first axis.
    shap_sum = np.abs(shap_values[1, :, :]).mean(0)
    importance_df = pd.DataFrame({
        'column_name': X_test_df.columns,
        'shap_values': shap_sum,
    })

    # sort_values returns a new frame, so the result must be kept.
    importance_df = importance_df.sort_values('shap_values', ascending=False)
    print(importance_df)


param_grid = [{
    'clf_cv__min_samples_leaf': [1, 3, 5],
}]
run_model_with_grid_search(param_grid=param_grid)

运行后产生如下错误:

Model type not yet supported by TreeExplainer: <class 'sklearn.model_selection._search.GridSearchCV'>

您正在尝试将 GridSearchCV 对象传递给 shap.TreeExplainer,而 TreeExplainer 不支持这种类型。

你可以改为传入网格搜索得到的最佳拟合模型来解决这个问题:

# Fit the grid search, then hand SHAP the refitted best estimator instead of
# the GridSearchCV wrapper itself.
cv = GridSearchCV(
    model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv_inner,
    refit=True,
).fit(split_x_train, split_y_train)
explainer = shap.TreeExplainer(cv.best_estimator_)

# Alternative for when the best estimator is a Pipeline: index out the step
# at position 1 (the classifier) and explain that.
explainer = shap.TreeExplainer(best_model[1])

`shap.TreeExplainer(best_model[1])` 可能才是你要找的写法(取最佳管道中索引为 1 的步骤,即分类器)。

你的 result.best_estimator_ 是下面这样一个复合的管道(Pipeline)对象,TreeExplainer 不支持它:

Pipeline(steps=[
('feature_sele', RFECV(cv=KFold(), estimator=RandomForestClassifier(), scoring='accuracy')),
('clf_cv', RandomForestClassifier())
])

要使其正常工作,您必须从管道中"提取"出已拟合的模型:

# Step 0 of the best pipeline is the fitted RFECV; its boolean ``support_``
# mask marks the features the best model actually uses.
fitted_rfecv = result.best_estimator_[0]
selected_features = fitted_rfecv.support_
print(selected_features)  # [False False False  True ...]

# Keep only the selected columns of the test data. Converting with
# np.asarray first makes the positional mask work whether split_x_test is a
# NumPy array or a pandas DataFrame (plain ``df[:, mask]`` fails on a
# DataFrame).
split_x_test_selected_features = np.asarray(split_x_test)[:, selected_features]
print(split_x_test_selected_features.shape)

# Step 1 is the classifier, fitted only on the selected features.
rf_model_fitted = result.best_estimator_[1]
print(rf_model_fitted)  # RandomForestClassifier()

# Explain the bare tree model on inputs with matching columns.
explainer = shap.TreeExplainer(rf_model_fitted)
shap_values = explainer.shap_values(split_x_test_selected_features)

注意 split_x_test_selected_features 中特征(列)的顺序。

相关内容

最新更新