将 FeatureUnion 的输出转换为 DictVectorizer 所需的字典



我正在尝试重现 DataCamp 教程中的一个管道(pipeline),但遇到了问题。我认为问题出在把 FeatureUnion 的输出转换为 DictVectorizer 所需的字典这一步。运行下面的代码时会报错,提示 BaseEstimator 和 TransformerMixin 未定义。如果有人能指出我哪里做错了,我将不胜感激。

import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer
from sklearn_pandas import CategoricalImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb
from sklearn.model_selection import cross_val_score



# Column names for the chronic kidney disease CSV (the file is read without a
# header row). The last column, 'class', is used as the target further down.
kidney_feature_names = [
    'age', 'bp', 'sg', 'al', 'su',
    'rbc', 'pc', 'pcc', 'ba', 'bgr',
    'bu', 'sc', 'sod', 'pot', 'hemo',
    'pcv', 'wc', 'rc', 'htn', 'dm',
    'cad', 'appet', 'pe', 'ane', 'class',
]

# Load the data; "?" entries are read as missing values.
kidney_data = pd.read_csv(
    "https://assets.datacamp.com/production/repositories/943/datasets/82c231cd41f92325cf33b78aaa360824e6b599b9/chronic_kidney_disease.csv",
    names=kidney_feature_names,
    index_col=False,
    na_values=["?"],
)

# Force these three columns to numeric; anything unparseable is coerced
# instead of raising.
kidney_data['pcv'] = pd.to_numeric(kidney_data['pcv'], errors='coerce')
kidney_data['wc'] = pd.to_numeric(kidney_data['wc'], errors='coerce')
kidney_data['rc'] = pd.to_numeric(kidney_data['rc'], errors='coerce')
print(kidney_data.dtypes)

# Features are every column but the last; the label is the last column.
X = kidney_data.iloc[:, :-1]
y = kidney_data.iloc[:, -1]

# Count the missing values in each feature column.
nulls_per_column = X.isna().sum()
print(nulls_per_column)

# Boolean mask: True where a feature column has object dtype (categorical).
categorical_feature_mask = X.dtypes == object
print(categorical_feature_mask)

# Names of the categorical feature columns.
categorical_columns = list(X.columns[categorical_feature_mask])
print(categorical_columns)

# Names of the remaining (non-categorical) feature columns.
non_categorical_columns = list(X.columns[~categorical_feature_mask])
print(non_categorical_columns)

# Numeric columns: one median SimpleImputer per column, each column selected
# as a one-element list.
numeric_imputation_mapper = DataFrameMapper(
    [
        ([column], SimpleImputer(strategy="median"))
        for column in non_categorical_columns
    ],
    input_df=True,
    df_out=True,
)

# Categorical columns: one default CategoricalImputer per column, each column
# selected by its bare name.
categorical_imputation_mapper = DataFrameMapper(
    [(column, CategoricalImputer()) for column in categorical_columns],
    input_df=True,
    df_out=True,
)

# Combine the numeric and categorical mappers into a single transformer.
numeric_categorical_union = FeatureUnion(
    transformer_list=[
        ("num_mapper", numeric_imputation_mapper),
        ("cat_mapper", categorical_imputation_mapper),
    ]
)

# Custom transformer that converts a pandas DataFrame into a list of dicts
# (the input format DictVectorizer consumes).
class Dictifier(BaseEstimator, TransformerMixin):
    """Pipeline step that turns a DataFrame into per-row dictionaries.

    NOTE: BaseEstimator and TransformerMixin are not imported anywhere in
    this script, so defining this class raises NameError as written.
    """

    def fit(self, X, y=None):
        """Do nothing and return this transformer (there is no state to fit)."""
        return self

    def transform(self, X):
        """Return ``X`` as a list of ``{column: value}`` dicts, one per row."""
        return X.to_dict(orient='records')
# Create full pipeline: impute -> DataFrame to dict records -> vectorize -> classify
pipeline = Pipeline([
    ("featureunion", numeric_categorical_union),
    # The trailing comma on this step matters: without it Python parses
    # `(...)(...)` as a call on the tuple and raises
    # "TypeError: 'tuple' object is not callable".
    ("dictifier", Dictifier()),
    ("vectorizer", DictVectorizer(sort=False)),
    ("clf", xgb.XGBClassifier(max_depth=3)),
])

# Perform 3-fold cross-validation, scoring with ROC AUC
cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)

简单的答案是:您需要从 sklearn.base 导入这两个类:

from sklearn.base import BaseEstimator, TransformerMixin

我也尝试重现了这个问题,并遇到了其他一些问题,不过我在这个回答中找到了解决方案:《管道中的 sklearn_pandas 返回 TypeError》

以下是我的完整代码:

# Import modules
import pandas as pd
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
# Create list of column names for kidney data: kidney_cols
kidney_cols = ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
               'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm',
               'cad', 'appet', 'pe', 'ane', 'label']

# Load dataset: df_kidney ('?' entries are read as missing values)
df_kidney = pd.read_csv('chronic_kidney_disease.csv', names=kidney_cols,
                        na_values='?')

# Replace label values with 0 (ckd) and 1 (notckd).
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form operates on a temporary object,
# which triggers a chained-assignment warning in recent pandas and leaves
# df_kidney unchanged when Copy-on-Write is enabled.
df_kidney['label'] = df_kidney['label'].replace({'ckd': 0, 'notckd': 1})

# Define X and y: X, y (features are every column but the last)
X, y = df_kidney.iloc[:, :-1], df_kidney['label'].values

# Define new column order for X: col_order
col_order = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot',
             'hemo', 'pcv', 'wc', 'rc', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm',
             'cad', 'appet', 'pe', 'ane']

# Rearrange columns of X
X = X[col_order]

# Create a boolean mask for categorical columns (object dtype)
categorical_feature_mask = X.dtypes == object

# Get a list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get a list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()
# Build the per-column transformer specs for the DataFrameMapper:
#   * numeric columns   -> median imputation followed by standard scaling
#   * categorical columns -> categorical imputation
# NOTE(review): `Imputer` comes from sklearn.preprocessing here; the first
# script in this document uses sklearn.impute.SimpleImputer instead. Confirm
# which one your scikit-learn version provides.
transformers = [
    *(
        ([column], [Imputer(strategy='median'), StandardScaler()])
        for column in non_categorical_columns
    ),
    *(
        (column, [CategoricalImputer()])
        for column in categorical_columns
    ),
]

# A single DataFrameMapper handles both column groups; it takes a DataFrame
# as input and is configured to output one as well.
numeric_categorical_union = DataFrameMapper(
    transformers,
    input_df=True,
    df_out=True,
)
# Pipeline-compatible transformer that turns a DataFrame into a list of dicts
class Dictifier(BaseEstimator, TransformerMixin):
    """Convert a DataFrame into per-row dictionaries inside a pipeline."""

    def fit(self, X, y=None):
        """Do nothing and return this transformer (there is no state to fit)."""
        return self

    def transform(self, X):
        """Return ``X`` as a list of ``{column: value}`` dicts, one per row."""
        return X.to_dict(orient='records')
# Pipeline and DictVectorizer are used below but are not among the imports at
# the top of this script, so bring them into scope here.
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Create full pipeline: impute/scale -> DataFrame to dict records ->
# vectorize -> classify
pipeline = Pipeline([
    ('featureunion', numeric_categorical_union),
    ('dictifier', Dictifier()),
    ('vectorizer', DictVectorizer(sort=False)),
    ('clf', xgb.XGBClassifier(max_depth=3)),
])

# Perform 3-fold cross-validation, scoring with ROC AUC
cross_val_scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=3)

最新更新