编辑:问题本身没有变化,但代码已更新。
我正在处理 Kaggle 上的 Home Credit(住房信贷)数据集,具体是 installments_payments.csv 文件。以下是我的自定义转换器(transformer):
class Xfrmer_replace1(BaseEstimator, TransformerMixin):
    """Replace the case-study sentinel value 365243 with 0 in the whole frame.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    returns the object produced by ``X.replace`` rather than editing ``X``
    in place.

    Note: +/-inf and NaN values are NOT touched by this transformer.
    """

    def __init__(self):
        # Placeholder attribute; nothing in this class reads it.
        self._features = None

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with every 365243 / 365243.0 replaced by 0."""
        # ``replace`` already operates on the entire frame, so one call is
        # enough; repeating it once per column only repeated the same work.
        X = X.replace([365243, 365243.0], 0)
        print('replaced values')
        return X
class Xfrmer_signchng1(BaseEstimator, TransformerMixin):
    """Turn negative values positive and set non-negative values to 0.

    For every column of the input frame: a value ``>= 0`` becomes ``0`` and
    any other value is multiplied by ``-1`` (NaN therefore stays NaN).
    The transformer is stateless.
    """

    def __init__(self):
        # Placeholder attribute; nothing in this class reads it.
        self.signchng_columns = None

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return a sign-adjusted copy of ``X``; the input is left untouched."""
        # Work on a copy: assigning into the frame handed over by the caller
        # mutates (a slice of) the caller's data and triggers pandas'
        # SettingWithCopyWarning.
        X = X.copy()
        for col in X.columns:
            print('sign change')
            values = X[col]
            # Negate everything, then overwrite the originally non-negative
            # entries with 0. NaN fails ``>= 0`` and so is kept (as NaN).
            X[col] = (-values).mask(values >= 0, 0)
        return X
class Xfrmer_dif_calc1(BaseEstimator, TransformerMixin):
    """Append the difference between the first two input columns.

    ``transform`` subtracts the second column of ``X`` from the first one and
    returns the result as an additional column named ``AMT_PMT_DIF``.
    The transformer is stateless.
    """

    def __init__(self):
        # Placeholder attribute; nothing in this class reads it.
        self.dif_columns = None

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return a new frame equal to ``X`` plus the ``AMT_PMT_DIF`` column."""
        print('diff caclulator')
        print('X columns', X.columns)
        first = X[X.columns[0]]
        second = X[X.columns[1]]
        print(first)
        print(second)
        # ``assign`` builds a new DataFrame instead of writing into the frame
        # passed in, which avoids SettingWithCopyWarning. The new column is
        # only present in the returned object, never in the caller's frame.
        X = X.assign(AMT_PMT_DIF=first - second)
        print(X['AMT_PMT_DIF'])
        return X
class Xfrmer_rto_calc1(BaseEstimator, TransformerMixin):
    """Append the ratio of the first two input columns.

    ``transform`` divides the first column of ``X`` by the second one, clips
    the result at a lower bound of 0 and returns it as an additional column
    named ``AMT_PMT_RTO``. The transformer is stateless.
    """

    def __init__(self):
        # Placeholder attribute; nothing in this class reads it.
        self.rto_columns = None

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return a new frame equal to ``X`` plus the ``AMT_PMT_RTO`` column."""
        print('ratio caclulator')
        # NOTE(review): a zero denominator is not handled here; the resulting
        # inf/NaN passes through ``clip(lower=0)`` unchanged.
        ratio = (X[X.columns[0]] / X[X.columns[1]]).clip(lower=0)
        # ``assign`` returns a new DataFrame rather than writing into the
        # frame passed in, which avoids SettingWithCopyWarning.
        return X.assign(AMT_PMT_RTO=ratio)
下面是我使用该管道的方式:
# Column groups handed to the individual transformers.
lst_all_cols = dtprcs.X_train.columns.values.tolist()
lst_signchng_cols = ["DAYS_INSTALMENT", "DAYS_ENTRY_PAYMENT"]
lst_imptr_cols = ['DAYS_ENTRY_PAYMENT', 'AMT_PAYMENT']
lst_diff_cols = ['AMT_PAYMENT', "AMT_INSTALMENT"]
lst_rto_cols = ['AMT_PAYMENT', "AMT_INSTALMENT"]
print('Starting pipeline processing')

# Each entry is (name, transformer, columns the transformer receives).
instpmt_preprcs_pipln = ColumnTransformer(
    transformers=[
        ('instpmt_repl_pipln', Xfrmer_replace1(), lst_all_cols),
        ('instpmt_sgnchng_pipln', Xfrmer_signchng1(), lst_signchng_cols),
        ('instpmt_imptr_piplin', SimpleImputer(strategy='median'), lst_imptr_cols),
        ('instpmt_dif_pipln', Xfrmer_dif_calc1(), lst_diff_cols),
        ('instpmt_rto_pipln', Xfrmer_rto_calc1(), lst_rto_cols),
    ],
    remainder='passthrough',
)

print('Pipeline fitting start...')
instpmt_preprcs_pipln.fit(dtprcs.X_train, dtprcs.y_train)
print('Pipeline fitting over...')

print('Pipeline transforming x_test...')
# Transform with the pipeline that was fitted above. The original line used
# `instpmt_partial_piplin`, a different object that is not defined in this
# snippet.
# The engineered columns are part of the value returned by `transform`;
# `dtprcs.X_train` / `dtprcs.x_test` themselves are not expected to gain
# columns, so inspect `y_pred` to see them.
y_pred = instpmt_preprcs_pipln.transform(dtprcs.x_test)
print('Pipeline transforming x_test over...')
print(type(dtprcs.X_train), type(dtprcs.x_test), type(dtprcs.y_train))
print(dtprcs.X_train.columns, dtprcs.x_test.columns)
print('Pipeline preprocessing pver. Seting up other classes...')
我的问题
如何在 ColumnTransformer 中向数据帧添加新列?我尝试过使用 .loc,也尝试过不使用 .loc。从下面的输出可以看到,新列的值确实被计算出来了,但并没有更新到数据帧中。
调试值在 fit() 过程中会打印出来,但在对测试数据集执行 transform() 时却没有打印。
最新的输出日志(含警告信息)
Finished reading apln train/test files...
installments_payments.csv
primary name train installments_payments_train.csv
primary name test installments_payments_test.csv
Train test files ready...
finished writing train/test files.
Exiting function(0).
(16915, 8)
(4574, 8)
Processing installments_payments.csv...
Starting pipeline processing
Pipeline fitting start...
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
sign change
sign change
diff caclulator
X columns Index(['AMT_PAYMENT', 'AMT_INSTALMENT'], dtype='object')
0 6948.360
2 6948.360
3 1716.525
4 1716.525
5 3375.000
...
42390 12303.000
42391 10299.960
42392 10869.435
42402 124.155
42409 4198.950
Name: AMT_PAYMENT, Length: 16915, dtype: float64
0 6948.360
2 6948.360
3 1716.525
4 1716.525
5 3375.000
...
42390 12303.000
42391 10299.960
42392 14958.135
42402 124.155
42409 4198.950
Name: AMT_INSTALMENT, Length: 16915, dtype: float64
0 0.0
2 0.0
3 0.0
4 0.0
5 0.0
...
42390 0.0
42391 0.0
42392 -4088.7
42402 0.0
42409 0.0
Name: AMT_PMT_DIF, Length: 16915, dtype: float64
ratio caclulator
Pipeline fitting over...
Pipeline transforming x_test...
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
sign change
sign change
diff caclulator
ratio caclulator
**Pipeline transforming x_test over...**
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
'AMT_INSTALMENT', 'AMT_PAYMENT'],
dtype='object') Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
'AMT_INSTALMENT', 'AMT_PAYMENT'],
dtype='object')
Pipeline preprocessing pver. Seting up other classes...
Exiting main function...
E:anacondaenvsappliedaicourselibsite-packagesipykernel_launcher.py:187: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
E:anacondaenvsappliedaicourselibsite-packagespandascoreindexing.py:362: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.obj[key] = _infer_fill_value(value)
E:anacondaenvsappliedaicourselibsite-packagespandascoreindexing.py:562: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.obj[item_labels[indexer[info_axis]]] = value
正如我在评论中所说,我首先使用下面这个类提取需要学习(.fit())的特征:
from sklearn.base import TransformerMixin
class FeatureExtractor(TransformerMixin):
    """Select a fixed set of columns from a pandas DataFrame."""

    def __init__(self, cols):
        """Remember the column labels to extract and echo them to stdout."""
        self.cols = cols
        print(self.cols)

    def fit(self, X, y=None):
        """Return ``self``; the transformer keeps no fitted state."""
        return self

    def transform(self, X):
        """Return the ``self.cols`` columns of ``X`` (``X`` must support ``.loc``)."""
        return X.loc[:, self.cols]
然后使用这个类从数据中的一列中学习:
class SynopsisNumWords(TransformerMixin):
    """Derive a word-count column from the ``Synopsis`` column."""

    def __init__(self):
        """Take no configuration."""
        pass

    def fit(self, X, y=None, **fit_params):
        """Return ``self``; there is nothing to learn."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return a one-column frame ``Synopsis_num_words``.

        Each value is the number of whitespace-separated tokens in the string
        form of the corresponding ``Synopsis`` entry.
        """
        frame = X.copy()
        synopsis = frame.loc[:, 'Synopsis']
        word_counts = synopsis.apply(lambda text: len(str(text).split()))
        # Renamed so the output column does not share the input column's name.
        renamed = word_counts.rename('Synopsis_num_words')
        return renamed.to_frame()
然后使用下面这个类将所有特征合并为一个数据帧:
class DFFeatureUnion(TransformerMixin):
    """FeatureUnion-like combiner that keeps pandas DataFrames.

    ``transformer_list`` is a list of ``(name, transformer)`` pairs. Every
    transformer receives the same input; the individual outputs are joined
    on their index with ``pd.merge``.
    """

    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def fit(self, X, y=None):
        """Fit every transformer on ``X`` (and ``y``) and return ``self``."""
        for _, transformer in self.transformer_list:
            # Forward the target as well, so transformers that need ``y``
            # during fitting can be used inside the union.
            transformer.fit(X, y)
        return self

    def transform(self, X):
        """Return the index-aligned merge of all transformer outputs.

        ``X`` must be a DataFrame and each transformer must return one.
        """
        frames = [transformer.transform(X) for _, transformer in self.transformer_list]
        return reduce(
            lambda left, right: pd.merge(left, right, left_index=True, right_index=True),
            frames,
        )
然后将所有这些结合起来,形成如下所示的管道。这个管道接收一个 9 列的数据帧,从其中一列学习并生成一个新列,然后将所有列合并,返回一个 10 列的数据帧。
from sklearn.pipeline import Pipeline
# Branch 1: pass the nine original columns through untouched.
_all_columns_branch = Pipeline(
    steps=[
        (
            'extract_all_features',
            FeatureExtractor(['Synopsis', 'Title', 'Author', 'Edition',
                              'Reviews', 'Ratings', 'Genre', 'BookCategory', 'Price']),
        ),
    ],
    verbose=True,
)

# Branch 2: pick the Synopsis column and derive the word-count column from it.
_num_words_branch = Pipeline(
    steps=[
        ('extract_Synopsis_feature', FeatureExtractor(['Synopsis'])),
        ('generate_num_words', SynopsisNumWords()),
    ],
    verbose=True,
)

# Both branches are merged back into a single DataFrame by DFFeatureUnion.
synopsis_feat_gen_pipeline = Pipeline(
    steps=[
        (
            'engineer_data',
            DFFeatureUnion([
                ('extract_all_columns', _all_columns_branch),
                ('generate_num_words_column', _num_words_branch),
            ]),
        ),
    ],
    verbose=True,
)