
我目前正在研究法国汽车索赔数据集freMTPL2freq"Kaggle大赛(https://www.kaggle.com/floser/french-motor-claims-datasets-fremtpl2freq)。不幸的是,我得到一个NotFittedError: All estimators failed to fit;错误,每当我使用RandomizedSearchCV,我不知道为什么。如有任何帮助,不胜感激。

import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
data_freq = pd.read_csv('freMTPL2freq.csv')
data_freq['Area'] = data_freq['Area'].str.replace(''','')
data_freq['VehBrand'] = data_freq['VehBrand'].str.replace(''','')
data_freq['VehGas'] = data_freq['VehGas'].str.replace(''','')
data_freq['Region'] = data_freq['Region'].str.replace(''','')
data_freq['frequency'] = data_freq['ClaimNb'] / data_freq['Exposure']
y = data_freq['frequency']
X = data_freq.drop(['frequency', 'ClaimNb', 'IDpol'], axis = 1)
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2, shuffle = True, random_state = 42)
pt_columns = ['VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']
cat_columns = ['Area', 'Region', 'VehBrand', 'VehGas']
from xgboost import XGBRegressor
ct = ColumnTransformer([('pt', 'passthrough', pt_columns),
('ohe', OneHotEncoder(), cat_columns)])
pipe_xgbr = Pipeline([('cf_trans', ct),
('ssc', StandardScaler(with_mean = False)),
('xgb_regressor', XGBRegressor())
param = {'xgb_regressor__n_estimators':[3, 5],
'xgb_regressor__max_depth':[3, 5, 7],
'xgb_regressor__learning_rate':[0.1, 0.5],
'xgb_regressor__colsample_bytree':[0.5, 0.8],
'xgb_regressor__subsample':[0.5, 0.8]
rscv = RandomizedSearchCV(pipe_xgbr, param_distributions = param, n_iter = 2, scoring = mean_squared_error, n_jobs = -1, cv = 5, error_score = 'raise')
rscv.fit(X_train, y_train, xgbr_regressor__sample_weight = X_train['Exposure'])


IDpol    ClaimNb Exposure    Area    VehPower    VehAge  DrivAge BonusMalus  VehBrand    VehGas  Density Region
0   1.0        1        0.10       D           5         0       55        50        B12    Regular 1217    R82
1   3.0        1        0.77       D           5         0       55        50        B12    Regular 1217    R82
2   5.0        1        0.75       B           6         2       52        50        B12    Diesel  54      R22
3   10.0       1        0.09       B           7         0       46        50        B12    Diesel  76      R72
4   11.0       1        0.84       B           7         0       46        50        B12    Diesel  76      R72
当设置verbose = 10和n_jobs = 1时,出现以下错误消息:

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START xgb_regressor__colsample_bytree=0.5, xgb_regressor__learning_rate=0.5, xgb_regressor__max_depth=5, xgb_regressor__n_estimators=5, xgb_regressor__subsample=0.5
C:UsersJananaconda3libsite-packagessklearnutilsvalidation.py:72: FutureWarning: Pass sample_weight=406477    1.0
393150    0.0
252885    0.0
260652    0.0
661256    0.0
154663    0.0
398414    0.0
42890     0.0
640774    0.0
114446    0.0
Name: frequency, Length: 108482, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
"will result in an error", FutureWarning)
TypeError                                 Traceback (most recent call last)
<ipython-input-84-74435f74c470> in <module>
----> 1 rscv.fit(X_train, y_train, xgb_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~anaconda3libsite-packagessklearnutilsvalidation.py in inner_f(*args, **kwargs)
61             extra_args = len(args) - len(all_args)
62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
65             # extra_args > 0
~anaconda3libsite-packagessklearnmodel_selection_search.py in fit(self, X, y, groups, **fit_params)
839                 return results
--> 841             self._run_search(evaluate_candidates)
843             # multimetric is determined here because in the case of a callable
~anaconda3libsite-packagessklearnmodel_selection_search.py in _run_search(self, evaluate_candidates)
1633         evaluate_candidates(ParameterSampler(
1634             self.param_distributions, self.n_iter,
-> 1635             random_state=self.random_state))
~anaconda3libsite-packagessklearnmodel_selection_search.py in evaluate_candidates(candidate_params, cv, more_results)
807                                    (split_idx, (train, test)) in product(
808                                    enumerate(candidate_params),
--> 809                                    enumerate(cv.split(X, y, groups))))
811                 if len(out) < 1:
~anaconda3libsite-packagesjoblibparallel.py in __call__(self, iterable)
1002             # remaining jobs.
1003             self._iterating = False
-> 1004             if self.dispatch_one_batch(iterator):
1005                 self._iterating = self._original_iterator is not None
~anaconda3libsite-packagesjoblibparallel.py in dispatch_one_batch(self, iterator)
833                 return False
834             else:
--> 835                 self._dispatch(tasks)
836                 return True
~anaconda3libsite-packagesjoblibparallel.py in _dispatch(self, batch)
752         with self._lock:
753             job_idx = len(self._jobs)
--> 754             job = self._backend.apply_async(batch, callback=cb)
755             # A job can complete so quickly than its callback is
756             # called before we get here, causing self._jobs to
~anaconda3libsite-packagesjoblib_parallel_backends.py in apply_async(self, func, callback)
207     def apply_async(self, func, callback=None):
208         """Schedule a func to be run"""
--> 209         result = ImmediateResult(func)
210         if callback:
211             callback(result)
~anaconda3libsite-packagesjoblib_parallel_backends.py in __init__(self, batch)
588         # Don't delay the application, to avoid keeping the input
589         # arguments in memory
--> 590         self.results = batch()
592     def get(self):
~anaconda3libsite-packagesjoblibparallel.py in __call__(self)
254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
258     def __len__(self):
~anaconda3libsite-packagesjoblibparallel.py in <listcomp>(.0)
254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
258     def __len__(self):
~anaconda3libsite-packagessklearnutilsfixes.py in __call__(self, *args, **kwargs)
220     def __call__(self, *args, **kwargs):
221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)
~anaconda3libsite-packagessklearnmodel_selection_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
624         fit_time = time.time() - start_time
--> 625         test_scores = _score(estimator, X_test, y_test, scorer, error_score)
626         score_time = time.time() - start_time - fit_time
627         if return_train_score:
~anaconda3libsite-packagessklearnmodel_selection_validation.py in _score(estimator, X_test, y_test, scorer, error_score)
685             scores = scorer(estimator, X_test)
686         else:
--> 687             scores = scorer(estimator, X_test, y_test)
688     except Exception:
689         if error_score == 'raise':
~anaconda3libsite-packagessklearnutilsvalidation.py in inner_f(*args, **kwargs)
72                           "will result in an error", FutureWarning)
73             kwargs.update(zip(sig.parameters, args))
---> 74             return f(**kwargs)
75         return inner_f
~anaconda3libsite-packagessklearnmetrics_regression.py in mean_squared_error(y_true, y_pred, sample_weight, multioutput, squared)
334     """
335     y_type, y_true, y_pred, multioutput = _check_reg_targets(
--> 336         y_true, y_pred, multioutput)
337     check_consistent_length(y_true, y_pred, sample_weight)
338     output_errors = np.average((y_true - y_pred) ** 2, axis=0,
~anaconda3libsite-packagessklearnmetrics_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
86         the dtype argument passed to check_array.
87     """
---> 88     check_consistent_length(y_true, y_pred)
89     y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
90     y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
~anaconda3libsite-packagessklearnutilsvalidation.py in check_consistent_length(*arrays)
314     """
--> 316     lengths = [_num_samples(X) for X in arrays if X is not None]
317     uniques = np.unique(lengths)
318     if len(uniques) > 1:
~anaconda3libsite-packagessklearnutilsvalidation.py in <listcomp>(.0)
314     """
--> 316     lengths = [_num_samples(X) for X in arrays if X is not None]
317     uniques = np.unique(lengths)
318     if len(uniques) > 1:
~anaconda3libsite-packagessklearnutilsvalidation.py in _num_samples(x)
247     if hasattr(x, 'fit') and callable(x.fit):
248         # Don't get num_samples from an ensembles length!
--> 249         raise TypeError(message)
251     if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>


度量函数mean_squared_error具有(y_true, y_pred, *, <kwargs>)特征,而使用字符串"neg_mean_squared_error"得到的计分器具有(estimator, X_test, y_test)特征。在回溯中,你看到

--> 687             scores = scorer(estimator, X_test, y_test)


根据您的错误消息,KeyError: 'xgbr_regressor'代码无法在您的管道中找到关键xgbr_regressor。在管道中,您已经定义了xgb_regressor:

pipe_xgbr = Pipeline(
[('cf_trans', ct),
('ssc', StandardScaler(with_mean = False)),
('xgb_regressor', XGBRegressor())])


rscv.fit(X_train, y_train, xgbr_regressor__sample_weight=X_train['Exposure'])

