查找并使用XGBoost回归管道中的前10个功能



我想用XGBRegressorft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10)获得前10个功能。但我怎么能在我的管道中使用它呢?

我有这个类FeatureSelector_Only_Top_10,我怎么能只使用前10个功能,然后打印出来?例如print(grid.feature_selection_top_10.top10features)

进口:

import time
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

XGB:

xgb_reg_start = time.time()
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train_nor, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train_nor)
val_preds_xgb_reg = xgb_reg.predict(X_test_nor)
xgb_reg_end = time.time()
print(f"Time taken to run: {round((xgb_reg_end - xgb_reg_start)/60,1)} minutes")
print("nTraining MSE:", round(metrics.mean_squared_error(y_train, training_preds_xgb_reg),4))
print("Validation MSE:", round(metrics.mean_squared_error(y_test, val_preds_xgb_reg),4))
print("nTraining r2:", round(metrics.r2_score(y_train, training_preds_xgb_reg),4))
print("Validation r2:", round(metrics.r2_score(y_test, val_preds_xgb_reg),4))
ft_weights_xgb_reg = pd.DataFrame(xgb_reg.feature_importances_, columns=['weight'], index=X_train.columns)
ft_weights_xgb_reg.sort_values('weight', inplace=True)
ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10)

管道:

class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
def __init__(self,n_components = 10):
self.n_components = n_components

def fit(self, X, y = None):
# Don't know
return self
def transform(self, X, y = None):
# Don't know
return X
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30)

steps = [#('feature_selection_top_10', FeatureSelector_Only_Top_10()),
#('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=100))),
('lasso', Lasso(alpha=0.03))]
pipeline = Pipeline(steps) 
parameteres = { }
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)                
grid.fit(X_train, y_train)
print(grid.best_params_)                    
print("score = %3.2f" %(grid.score(X_test,y_test)))

如果您想在Pipeline中选择数据集的N最佳功能,您应该定义一个自定义Transformer。

该对象应在transform()方法期间训练并从xgboost中选择N最佳特性。然后,在transform()方法期间,此转换器应该相应地过滤数据集。

我会这样做:

from sklearn.datasets import make_regression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Lasso
import pandas as pd
import xgboost as xgb
class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
def __init__(self,n_components = 10):
self.n_components = n_components
self.top_n_features = None
def fit(self, X, y = None):
X = pd.DataFrame(X)
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X, y)
self.top_n_features = (pd.DataFrame(
xgb_reg.feature_importances_,
columns=['weight'],
index=X.columns)
.sort_values(by='weight', ascending=False)
.head(10)
)
return self
def transform(self, X, y = None):
return pd.DataFrame(X).filter(self.top_n_features.index)


X, y = make_regression(n_features=50)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30)

steps = [('feature_selection_top_10', FeatureSelector_Only_Top_10()),
('lasso', Lasso(alpha=0.03))]
pipeline = Pipeline(steps) 
pipeline.fit(X, y)       
print("score = %3.2f" %(pipeline.score(X_test,y_test)))
#retrieve the top N features and their weights
pipeline['feature_selection_top_10'].top_n_features

您可以在管道中包含SelectFromModel,以便根据其重要性权重提取前10个特征,无需创建自定义转换器。如文档中所述,如果要选择10个功能,则需要设置max_features=10threshold=-np.inf

import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
X, y = make_regression(n_features=100, n_samples=1000, random_state=42)
X = pd.DataFrame(data=X, columns=['x' + str(i) for i in range(X.shape[1])])
y = pd.Series(y, name='y')
pipeline = Pipeline([
('selector', SelectFromModel(estimator=XGBRegressor(), max_features=10, threshold=-np.inf)),
('regressor', LinearRegression())
])
pipeline.fit(X, y)
selected_features = pipeline['selector'].get_support()
print(selected_features.sum())
# 10
selected_features_names = X.columns[selected_features].tolist()
print(selected_features_names)
# ['x0', 'x14', 'x17', 'x35', 'x42', 'x43', 'x57', 'x71', 'x84', 'x95']
selected_features_importances = pipeline['selector'].estimator_.feature_importances_[selected_features]
print(selected_features_importances)
# [0.09361505 0.18474296 0.14420615 0.01952794 0.10946904 0.02192107 0.03307951 0.02948984 0.02851948 0.1216883]
selected_features_coefficients = pipeline['regressor'].coef_
print(selected_features_coefficients)
# [49.43000693 83.91437854 78.25242596 -0.76411769 56.67970515  0.16829694 28.81967319  0.50277914 24.55006237 68.17120687]

最新更新