加载序列化的管道时只返回列名数组



我用 joblib 和 tempfile 把一个包含自定义类的管道(Pipeline)序列化成了 pickle;但在加载这个序列化的管道时,我得到的只是一个包含特征名称的数组。这是我第一次为机器学习管道开发自定义类。我不知道 scikit-learn 是否自带用于选择 DataFrame 列的类,也不知道该如何实现类似的功能。


# Custom classes
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns ``X[attribute_names]`` for its input ``X``."""

    def __init__(self, attribute_names):
        # Label(s) later used to index the input inside transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing from the data and return this instance."""
        return self

    def transform(self, X):
        """Index ``X`` with the stored attribute names and return the result."""
        wanted = self.attribute_names
        return X[wanted]
class MeanEncoder(BaseEstimator, TransformerMixin):
    """Encode object-dtype columns by the rank of their per-category target mean.

    ``fit`` groups every object-dtype column by category, averages the target
    per category and numbers the categories 0, 1, 2, ... in order of
    descending mean.  ``transform`` replaces each category with that number;
    values that have no entry in the learned mapping become NaN (this is the
    behaviour of ``Series.map`` with a dict).
    """

    def __init__(self):
        # The encoder has no hyper-parameters.
        pass

    def fit(self, X, y):
        """Learn the category -> ordinal mapping of every object-dtype column.

        Args:
            X: pandas DataFrame; a pandas Series is promoted to a one-column
                DataFrame.
            y: target values; must expose ``shape`` and ``copy`` and have one
                entry per row of ``X``.

        Returns:
            The fitted encoder (``self``).

        Raises:
            ValueError: if ``X`` is neither a DataFrame nor a Series, or if
                ``X`` and ``y`` differ in length.
        """
        if isinstance(X, pd.Series):
            X = pd.DataFrame(X)
        elif not isinstance(X, pd.DataFrame):
            raise ValueError('Not a pandas DataFrame')
        if X.shape[0] != y.shape[0]:
            raise ValueError('The length of X is different than the length of y')

        # Work on a copy so the caller's frame never gains a TARGET column.
        df = pd.DataFrame(X.copy())
        df['TARGET'] = y.copy()

        labels = {}
        for col in X.select_dtypes('object').columns:
            # Categories ordered from highest to lowest mean target.
            ordered_labels = (
                df.groupby(col)
                .agg({'TARGET': 'mean'})
                .sort_values('TARGET', ascending=False)
                .index
            )
            labels[col] = {
                category: rank for rank, category in enumerate(ordered_labels)
            }

        self.labels_ = labels
        self.train_cols_ = X.columns
        return self

    def transform(self, X):
        """Replace categories with the ordinals learned in ``fit``.

        Args:
            X: pandas DataFrame (or Series) containing every column that was
                present during ``fit``.

        Returns:
            A new DataFrame restricted to the training columns, with the
            encoded columns mapped to their ordinals.

        Raises:
            ValueError: if ``X`` is neither a DataFrame nor a Series, or if
                any training column is missing from ``X``.
        """
        if isinstance(X, pd.Series):
            X = pd.DataFrame(X)
        elif not isinstance(X, pd.DataFrame):
            raise ValueError('Not a pandas DataFrame')

        # Validate BEFORE indexing: selecting absent columns would otherwise
        # fail inside pandas with a KeyError and never reach this message.
        missing = [col for col in self.train_cols_ if col not in X.columns]
        if missing:
            raise ValueError(
                'Missing the following columns: {}'.format(missing)
            )

        X_transform = X[self.train_cols_].copy()
        for col, mapping in self.labels_.items():
            X_transform[col] = X_transform[col].map(mapping)
        return X_transform
# --------------------------------------------------------------------
# Pipeline: column selection -> target-mean rank encoding -> XGBoost classifier.
# NOTE(review): `vars` is defined elsewhere and shadows the builtin of the
# same name; consider renaming it where it is declared.
pipeline_grid = Pipeline(steps=[
    ('select_vars', DataFrameSelector(vars)),
    ('encoder', MeanEncoder()),
    ('xgboost', xgb.XGBClassifier(random_state=SEED, n_jobs=5, verbosity=2)),
])
search = RandomizedSearchCV(
    estimator=pipeline_grid,
    param_distributions=params_dist_grid,
    n_iter=5,
    cv=cv,
    n_jobs=5,
    scoring='roc_auc',
    random_state=SEED,
    verbose=3,
)
# NOTE(review): `best_estimator_` is only available after `search.fit(...)`
# has been called; the fit call is not part of this snippet, so it must run
# before the next line.
pipeline_model = search.best_estimator_

s3 = boto3.resource('s3')
# Write: serialise the fitted pipeline into a temp file and upload its bytes.
with tempfile.TemporaryFile() as fp:
    joblib.dump(pipeline_model, fp)
    fp.seek(0)  # rewind so read() returns the bytes that were just written
    s3.Bucket(NM_BUCKET).put_object(Key=path + name_pipe, Body=fp.read())

# Load: download the object into a temp file and deserialise it.
# NOTE: joblib.load unpickles the payload, which can execute arbitrary code;
# only load artifacts from a bucket you trust.  The custom transformer
# classes must also be importable when loading.
s3 = boto3.client('s3')
with tempfile.TemporaryFile() as fp:
    s3.download_fileobj(Fileobj=fp, Bucket=NM_BUCKET, Key=path + name_pipe)
    fp.seek(0)  # rewind before handing the file to joblib
    pipe = joblib.load(fp)

结果如下:

array(['VAR01', 'VAR02', 'VAR03', 'VAR04', 'VAR05', 'VAR06', 'VAR07', 'VAR08', 'VAR09', 'VAR10'])

我通过从 DataFrameSelector 中删除 BaseEstimator 解决了这个问题。

现在这个类看起来是这样的:

class DataFrameSelector(TransformerMixin):
    """Transformer whose ``transform`` returns ``X[attribute_names]``."""

    def __init__(self, attribute_names):
        # Stored as given; used as the indexer in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """No fitting is performed; the instance itself is returned."""
        return self

    def transform(self, X):
        """Return the result of indexing ``X`` with the stored names."""
        selection = X[self.attribute_names]
        return selection

最新更新