Feature-engine RareLabelEncoder:ValueError: could not convert string to float: 'Rare'


from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import RareLabelEncoder
from feature_engine.encoding import MeanEncoder
from feature_engine.encoding import OrdinalEncoder
from feature_engine.transformation import BoxCoxTransformer, PowerTransformer
# Column groups, keyed by the encoding strategy applied to them.
high_card_cols = ['brand', 'model', 'location']
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']

# One (transformer, columns) pair per preprocessing step.
# NOTE: 'brand' and 'model' are listed for RareLabelEncoder and, through
# high_card_cols, for MeanEncoder as well.
processor = make_column_transformer(
    # Group rare categorical observations.
    (RareLabelEncoder(n_categories=9), ['brand', 'model']),
    # Encode categorical observations with the target mean.
    (MeanEncoder(), high_card_cols),
    # Encode the low-cardinality variables.
    (OrdinalEncoder(), cat_cols),
    # Transform the continuous variable with an exponential transformation.
    (PowerTransformer(), ['milage_kmpl']),
    # Transform the continuous variables with Box-Cox.
    (BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),
    remainder="passthrough",
)

我目前正在做一个回归任务。我有两个高基数的分类列,其中包含一些出现次数很少的稀有类别。我创建了一个预处理流程,先使用 RareLabelEncoder,然后是 MeanEncoder 和其他编码器。

当我尝试拟合一个简单的线性回归模型时,我得到以下错误:

ValueError: could not convert string to float: 'Rare'

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
# Preprocess the columns, standardise the result, then fit a linear regression.
lr_pipe = make_pipeline(
    processor,
    StandardScaler(),
    LinearRegression(),
)
lr_pipe.fit(X_train, y_train.price)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-91-1c31eaf7c59a> in <module>
8 )
9 
---> 10 lr_pipe.fit(X_train, y_train.price)
~\anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
339         """
340         fit_params_steps = self._check_fit_params(**fit_params)
--> 341         Xt = self._fit(X, y, **fit_params_steps)
342         with _print_elapsed_time('Pipeline',
343                                  self._log_message(len(self.steps) - 1)):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
301                 cloned_transformer = clone(transformer)
302             # Fit or load from cache the current transformer
--> 303             X, fitted_transformer = fit_transform_one_cached(
304                 cloned_transformer, X, y, None,
305                 message_clsname='Pipeline',
~\anaconda3\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350 
351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
353 
354     def call_and_shelve(self, *args, **kwargs):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752     with _print_elapsed_time(message_clsname, message):
753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
755         else:
756             res = transformer.fit(X, y, **fit_params).transform(X)
~\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
700         else:
701             # fit method of arity 2 (supervised transformation)
--> 702             return self.fit(X, y, **fit_params).transform(X)
703 
704 
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
728         # Reset internal state before fitting
729         self._reset()
--> 730         return self.partial_fit(X, y, sample_weight)
731 
732     def partial_fit(self, X, y=None, sample_weight=None):
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
764         """
765         first_call = not hasattr(self, "n_samples_seen_")
--> 766         X = self._validate_data(X, accept_sparse=('csr', 'csc'),
767                                 estimator=self, dtype=FLOAT_DTYPES,
768                                 force_all_finite='allow-nan', reset=first_call)
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
419             out = X
420         elif isinstance(y, str) and y == 'no_validation':
--> 421             X = check_array(X, **check_params)
422             out = X
423         else:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61             extra_args = len(args) - len(all_args)
62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
64 
65             # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
671                     array = array.astype(dtype, casting="unsafe", copy=False)
672                 else:
--> 673                     array = np.asarray(array, order=order, dtype=dtype)
674             except ComplexWarning as complex_warning:
675                 raise ValueError("Complex data not supported\n"
~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81 
82     """
---> 83     return array(a, dtype, copy=False, order=order)
84 
85 
ValueError: could not convert string to float: 'Rare'

如何克服这个问题?

Feature-engine 转换器的优点在于,您可以直接在转换器上通过 variables 参数指定要处理的变量,因此根本不需要使用 sklearn 的 ColumnTransformer。您可以把所有 Feature-engine 转换器直接放进同一个管道(Pipeline)中。

# Feature-engine transformers select their own columns through `variables`,
# so they can be chained directly in a single pipeline; each step then sees
# the output of the previous one.
lr_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9, variables=['brand', 'model']),
    MeanEncoder(variables=high_card_cols),
    OrdinalEncoder(variables=cat_cols),
    # Remaining numeric transformations, applied to the same columns as in
    # the column-transformer version of the preprocessing.
    PowerTransformer(variables=['milage_kmpl']),
    BoxCoxTransformer(variables=['kilometers_driven', 'engine', 'power']),
    StandardScaler(),
    LinearRegression(),
)
lr_pipe.fit(X_train, y_train.price)

更新:

我设法用下列方法解决了这个问题:

我把 RareLabelEncoder 直接放进了管道(Pipeline)里,而不是放在 ColumnTransformer 里面。这为我解决了这个问题。

# RareLabelEncoder runs as its own pipeline step, ahead of the column-wise
# preprocessing, scaling and regression.
lr_pipe = make_pipeline(
    # 0.002 is passed positionally -- presumably the frequency threshold
    # (`tol`); confirm against the feature-engine documentation.
    RareLabelEncoder(0.002, variables=['brand', 'model']),
    nontree_processor,
    StandardScaler(),
    LinearRegression(),
)

ColumnTransformer 是并行(而不是依次)应用其中各个转换器的,因此 brand 列(model 列同理)实际上会在 processor 的输出中出现两次:一次只做了稀有标签分组、没有再做其他编码(仍是字符串,所以抛出该错误),另一次被均值编码(但这一次没有经过稀有类别分组)。您可以用一个嵌套的管道来解决这个问题:

cat_cols = ['fuel_type', 'transmission', 'is_first_owner']

# Chain the two encoders so that 'brand' and 'model' go through
# RareLabelEncoder first and MeanEncoder second.
brandmodel_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9),
    MeanEncoder(),
)

# Each column is listed under exactly one transformer.
processor = make_column_transformer(
    (brandmodel_pipe, ['brand', 'model']),
    (MeanEncoder(), ['location']),
    (OrdinalEncoder(), cat_cols),
    (PowerTransformer(), ['milage_kmpl']),
    (BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),
    remainder="passthrough",
)

最新更新