from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import RareLabelEncoder
from feature_engine.encoding import MeanEncoder
from feature_engine.encoding import OrdinalEncoder
from feature_engine.transformation import BoxCoxTransformer, PowerTransformer
high_card_cols = ['brand', 'model', 'location']
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']
processor = make_column_transformer(
(RareLabelEncoder(n_categories = 9), ['brand', 'model']), #to group rare cateogircal observations
(MeanEncoder(), high_card_cols), # To encode categorical observations with target mean
(OrdinalEncoder(), cat_cols), #to encode low cardinal variables
(PowerTransformer(), ['milage_kmpl']), # transform continuous variables using Exponential transformation
(BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),# transform continuous variables using BoxCox
remainder = "passthrough"
)
我目前正在执行一个回归任务。我有两个分类列,它们具有高基数和罕见的观察值。我创建了一个管道,其中包括rarelabelencoder,然后是meanencoder和其他编码器。
当我尝试拟合一个简单的线性回归模型时,我得到以下错误:
ValueError: could not convert string to float: 'Rare'
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
lr_pipe = make_pipeline(
(processor),
(StandardScaler()),
(LinearRegression())
)
lr_pipe.fit(X_train, y_train.price)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-91-1c31eaf7c59a> in <module>
8 )
9
---> 10 lr_pipe.fit(X_train, y_train.price)
~anaconda3libsite-packagessklearnpipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~anaconda3libsite-packagessklearnpipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~anaconda3libsite-packagesjoblibmemory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~anaconda3libsite-packagessklearnpipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~anaconda3libsite-packagessklearnbase.py in fit_transform(self, X, y, **fit_params)
700 else:
701 # fit method of arity 2 (supervised transformation)
--> 702 return self.fit(X, y, **fit_params).transform(X)
703
704
~anaconda3libsite-packagessklearnpreprocessing_data.py in fit(self, X, y, sample_weight)
728 # Reset internal state before fitting
729 self._reset()
--> 730 return self.partial_fit(X, y, sample_weight)
731
732 def partial_fit(self, X, y=None, sample_weight=None):
~anaconda3libsite-packagessklearnpreprocessing_data.py in partial_fit(self, X, y, sample_weight)
764 """
765 first_call = not hasattr(self, "n_samples_seen_")
--> 766 X = self._validate_data(X, accept_sparse=('csr', 'csc'),
767 estimator=self, dtype=FLOAT_DTYPES,
768 force_all_finite='allow-nan', reset=first_call)
~anaconda3libsite-packagessklearnbase.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
419 out = X
420 elif isinstance(y, str) and y == 'no_validation':
--> 421 X = check_array(X, **check_params)
422 out = X
423 else:
~anaconda3libsite-packagessklearnutilsvalidation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~anaconda3libsite-packagessklearnutilsvalidation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
671 array = array.astype(dtype, casting="unsafe", copy=False)
672 else:
--> 673 array = np.asarray(array, order=order, dtype=dtype)
674 except ComplexWarning as complex_warning:
675 raise ValueError("Complex data not supportedn"
~anaconda3libsite-packagesnumpycore_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: could not convert string to float: 'Rare'
如何克服这个问题?
Feature-engine transformer的美妙之处在于您可以直接在transformer上选择变量,因此根本不需要使用sklearn的列transformer。你可以把所有的特征引擎转换器直接放在一个管道中。
lr_pipe = make_pipeline(
(RareLabelEncoder(n_categories = 9,variables= ['brand', 'model']),
(MeanEncoder(variables=high_card_cols),
(OrdinalEncoder(variables = cat_cols),
etc...
(StandardScaler()),
(LinearRegression())
)
lr_pipe.fit(X_train, y_train.price)
更新:
我设法用下列方法解决了这个问题:
我在管道中添加了rarelabelencoder,而不是列转换器。这为我解决了这个问题。
lr_pipe = make_pipeline(
(RareLabelEncoder(0.002, variables = ['brand', 'model'])),
(nontree_processor),
(StandardScaler()),
(LinearRegression())
)
ColumnTransformer
并行应用其变压器,因此brand
列实际上从processor
中出现两次:一次使用稀有标签分组,但不以其他方式编码(抛出错误),然后再次平均编码(但使用稀有组获得不同的值)。你可以使用管道来解决这个问题:
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']
brandmodel_pipe = make_pipeline(
RareLabelEncoder(n_categories=9),
MeanEncoder(),
)
processor = make_column_transformer(
(brandmodel_pipe, ['brand', 'model']),
(MeanEncoder(), ['location']),
(OrdinalEncoder(), cat_cols),
(PowerTransformer(), ['milage_kmpl']),
(BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),
remainder = "passthrough"
)