predict() 引发 ValueError('train and valid dataset categorical_feature do not match.')



我使用贝叶斯 HPO 来优化回归目标的 LightGBM 模型。为此,我改写了一个分类任务的模板来处理我的数据。到目前为止,样本内拟合是有效的,但当我尝试使用 predict() 进行样本外预测时,收到了一条错误消息。

我的样本外拟合函数如下:

def test_kfold(params, train, train_y, test, cv):
    """Train a LightGBM model on each CV fold and average the test predictions.

    Args:
        params: dict of LightGBM training parameters.
        train: DataFrame of training features.
        train_y: Series of training targets, index-aligned with ``train``.
        test: DataFrame of test features. It must contain exactly the training
            feature columns with the same dtypes — in particular, categorical
            columns must share the same categories as in training, otherwise
            LightGBM raises "train and valid dataset categorical_feature do
            not match".
        cv: scikit-learn splitter (e.g. KFold) exposing ``split`` and ``n_splits``.

    Returns:
        Array of test predictions averaged over all folds.
    """
    test_preds = 0.
    valid_preds = np.zeros(train_y.shape)
    for fold, (train_ix, valid_ix) in enumerate(cv.split(train, train_y)):
        # FIX: the original printed a literal "nFOLD" — the backslash of "\n"
        # had been lost in the paste.
        print(f"\nFOLD: {fold+1} {'='*50}")
        X_train, X_valid = train.iloc[train_ix], train.iloc[valid_ix]
        y_train, y_valid = train_y.iloc[train_ix], train_y.iloc[valid_ix]

        # free_raw_data=False keeps the raw pandas data attached so the
        # Datasets remain usable after construction.
        lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
        lgb_valid = lgb.Dataset(X_valid, y_valid, free_raw_data=False)

        rlf = lgb.train(params, lgb_train, 5000,
                        valid_sets=[lgb_train, lgb_valid],
                        verbose_eval=250, early_stopping_rounds=50)

        # FIX: Booster.predict's keyword is `num_iteration` (singular).
        # `num_iterations` is not the predict argument, so the best-iteration
        # cutoff was not being applied as intended.
        valid_preds[valid_ix] = rlf.predict(X_valid, num_iteration=rlf.best_iteration)
        # NOTE(review): if `test` still contains the target column (e.g.
        # 'avg_power'), drop it before calling this function — predicting on a
        # frame with extra/unseen categorical content triggers the
        # categorical_feature mismatch error quoted below. TODO confirm.
        test_preds += rlf.predict(test, num_iteration=rlf.best_iteration)
    print(f'Valid CV: {sklearn.metrics.mean_squared_error(train_y, valid_preds, squared = False)}')
    test_preds /= cv.n_splits
    return test_preds

参数和实际函数调用如下所示:

# Best hyper-parameters found by the Bayesian optimiser (LGB_BO), plus fixed
# (non-tuned) training settings for the final 5-fold run.
params = {
'feature_fraction': LGB_BO.max['params']['feature_fraction'],
'lambda_l1': LGB_BO.max['params']['lambda_l1'],
'lambda_l2': LGB_BO.max['params']['lambda_l2'],
'learning_rate': LGB_BO.max['params']['learning_rate'],
# Integer-valued parameters: the optimiser proposes floats, so round and cast.
'max_depth': int(np.round(LGB_BO.max['params']['max_depth'])),
'min_data_in_leaf': int(np.round(LGB_BO.max['params']['min_data_in_leaf'])),
'min_gain_to_split': LGB_BO.max['params']['min_gain_to_split'],
'min_sum_hessian_in_leaf': LGB_BO.max['params']['min_sum_hessian_in_leaf'],
'num_leaves': int(np.round(LGB_BO.max['params']['num_leaves'])),
# Fixed settings below (not tuned by the optimiser).
'max_bin': 255,
'bagging_fraction': 0.8,
'bagging_freq': 3,
'save_binary': True,
# All seeds pinned for reproducibility.
'seed': 7,
'feature_fraction_seed': 7,
'bagging_seed': 7,
'drop_seed': 7,
'data_random_seed': 7,
'objective': 'regression',
'boosting_type': 'gbdt',
'verbose': 1,
'metric': 'rmse',
# NOTE(review): is_unbalance targets binary classification; it should have no
# effect with objective='regression' — confirm it can be dropped.
'is_unbalance': True,
'boost_from_average': True,
'n_jobs': -1
}
# 5-Fold testing
# Warnings are suppressed only for the duration of the CV run.
with warnings.catch_warnings():
warnings.filterwarnings('ignore')    
test_preds = test_kfold(params, train, train_y, test, KFold(n_splits=5, random_state=7, shuffle=True))

调用函数会引发以下错误:

In [181]: test_preds += rlf.predict(test.drop(['avg_power'], axis = 1), num_iterations=rlf.best_iteration)
Traceback (most recent call last):

File "C:UsersUserAppDataLocalTemp/ipykernel_16312/2582764377.py", line 1, in <module>
test_preds += rlf.predict(test.drop(['avg_power'], axis = 1), num_iterations=rlf.best_iteration)

File "C:UsersUserAppDataLocalTemp/ipykernel_16312/2582764377.py", line 1, in <module>
test_preds += rlf.predict(test.drop(['avg_power'], axis = 1), num_iterations=rlf.best_iteration)

File "C:UsersUseranaconda3libsite-packageslightgbmbasic.py", line 3538, in predict
return predictor.predict(data, start_iteration, num_iteration,

File "C:UsersUseranaconda3libsite-packageslightgbmbasic.py", line 820, in predict
data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]

File "C:UsersUseranaconda3libsite-packageslightgbmbasic.py", line 575, in _data_from_pandas
raise ValueError('train and valid dataset categorical_feature do not match.')

ValueError: train and valid dataset categorical_feature do not match.

训练数据集被拆分为两部分:包含目标变量的数组 train_y,以及包含用于回归的所有其他特征的数据框 train。测试数据集 test 在同一个数据框中包含 train 和 train_y 的全部特征。

我希望这些信息足以澄清这个问题。遗憾的是,我无法共享实际数据集。

对我来说,这个问题是通过从测试集中删除训练期间未出现过的类别来解决的。

下面是一个 transformer(转换器)的示例,可以解决这个问题。因为在 transform 期间,测试集中所有未知的类别都会被替换为 'Other',所以 LightGBM 不会遇到未见过的类别值。

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class TakeTopK(BaseEstimator, TransformerMixin):
    """Keep the top-k most frequent categories per categorical column.

    During ``transform``, any value not among the k most frequent categories
    recorded in ``fit`` is replaced with the string ``'Other'``, so downstream
    LightGBM never encounters category values unseen at train time.

    Args:
        k: number of most frequent categories to keep per column.
        cat_feat: list of categorical column names (default: no columns).
        cont_feat: list of continuous column names, cast to float (default: none).
    """

    # FIX: the original signature `__init__(self, k=20, cat_feat, cont_feat)`
    # is a SyntaxError (non-default argument follows a default argument).
    # Giving the feature lists None defaults also makes the `TakeTopK(50)`
    # call in the pipeline example valid.
    def __init__(self, k=20, cat_feat=None, cont_feat=None):
        self.largest_cat = {}
        self.k = k
        self.cat_feat = cat_feat if cat_feat is not None else []
        self.cont_feat = cont_feat if cont_feat is not None else []

    def fit(self, X, y=None):
        """Record the k most frequent values of each categorical column."""
        for col in self.cat_feat:
            self.largest_cat[col] = X[col].value_counts().nlargest(self.k).index
        return self

    def transform(self, X, y=None):
        """Return a new frame with rare/unseen categories collapsed to 'Other'."""
        Xt = pd.DataFrame()
        for col in self.cat_feat:
            # dtype='category' lets LightGBM pick up the column as categorical,
            # so it does not need to be declared explicitly to the classifier.
            Xt[col] = pd.Series(
                np.where(X[col].isin(self.largest_cat[col]), X[col], 'Other'),
                dtype='category',
            )
        Xt[self.cont_feat] = X[self.cont_feat].astype(float)
        return Xt


# Preprocess with TakeTopK, then fit the LightGBM estimator.
# NOTE(review): TakeTopK(50) sets k=50 but supplies no cat_feat/cont_feat —
# confirm the feature lists are passed, otherwise the transformer has no
# columns to operate on.
# NOTE(review): for a regression objective, lgb.LGBMRegressor is the usual
# sklearn-API class; LGBMModel is the generic base — verify intent.
pipeline = Pipeline([
('top_k', TakeTopK(50)),
('clf', lgb.LGBMModel())    
])

最新更新