在自定义转换器中使用.loc会生成带有切片错误的副本



EDIT:问题保持不变,但代码已更改。

我正在Kaggle上的住房信贷数据集上工作,特别是在instruction_payment.csv上。以下是我的自定义变压器

class Xfrmer_replace1(BaseEstimator, TransformerMixin):
"""
this transformer does the global repplace within the dataframe
replace 365243 spcific to this case study with 0
replace +/-inf , nan with zero
"""
# constructor
def __init__(self):
#we are not going to use this 
self._features = None

#Return self 
def fit(self, X,y=None  ):
return self

def transform(self,X,y=None):        
#replace high values with zero
for col in X.columns:
X=X.replace([365243,365243.0],0)
print('replaced values')
#X=X.replace([np.inf,-np.inf],np.nan)
#X=X.replace(np.nan,0)    

return X
class Xfrmer_signchng1(BaseEstimator, TransformerMixin):
"""
this transformer does the change for positive to negative 
"""
# constructor
def __init__(self):
#we are not going to use this         
self.signchng_columns = None

#Return self 
def fit(self,X,y=None  ):
return self

def transform(self,X,y=None):        
#change the sign of the columns
for col in X.columns:
print('sign change')
X[col]= [0  if val >= 0 else (val *-1) for val in X[col] ]

return X  
class Xfrmer_dif_calc1(BaseEstimator, TransformerMixin):
"""
this transformer does the difference bewteen the two columns
the i/p is a list of tuples
the second item in the tuple is divided from the first item
the third item in the tuple is the name of this new column    
"""
# constructor
def __init__(self):
#we are not going to use this         
self.dif_columns = None

#Return self 
def fit(self,X,y=None):
return self

def transform(self,X,y=None):
print('diff caclulator')
print('X columns', X.columns)
print(X[X.columns[0]])
print(X[X.columns[1]])
#print(X['AMT_PAYMENT'])
#print(X['AMT_INSTALMENT'])
#print(X[X.columns[0]] - X[X.columns[1]])
#iter1.X.loc[:,'AMT_PMT_DIF'] = X[X.columns[0]] - X[X.columns[1]]
X['AMT_PMT_DIF'] = X[X.columns[0]] - X[X.columns[1]]
print(X['AMT_PMT_DIF'])
return X   

class Xfrmer_rto_calc1(BaseEstimator, TransformerMixin):
"""
this transformer calculates the ratio between two columns
the i/p is a list of tuples
the first item in the tuple is divided from the second item
the third item in the tuple is the name of this new column
"""
# constructor
def __init__(self):
#we are not going to use this 
self.rto_columns = None

#Return self 
def fit(self,X,y=None):
return self

def transform(self,X,y=None):        
print('ratio caclulator')
#iter1.X.loc[:,'AMT_PMT_RTO'] = (X[X.columns [0]] / X[X.columns [1]]).clip(lower=0)
X['AMT_PMT_RTO'] = (X[X.columns [0]] / X[X.columns [1]]).clip(lower=0)

return X  

这就是我消耗管道的方式

lst_all_cols = dtprcs.X_train.columns.values.tolist()
lst_signchng_cols = ["DAYS_INSTALMENT","DAYS_ENTRY_PAYMENT"]            
lst_imptr_cols=['DAYS_ENTRY_PAYMENT','AMT_PAYMENT']
lst_diff_cols = ['AMT_PAYMENT',"AMT_INSTALMENT"]            
lst_rto_cols = ['AMT_PAYMENT',"AMT_INSTALMENT"] 
print('Starting pipeline processing')        
#"""
instpmt_preprcs_pipln = ColumnTransformer( transformers = [
( 'instpmt_repl_pipln', Xfrmer_replace1(),lst_all_cols ),
( 'instpmt_sgnchng_pipln', Xfrmer_signchng1(),lst_signchng_cols ),
( 'instpmt_imptr_piplin',SimpleImputer(strategy = 'median'),lst_imptr_cols ),
('instpmt_dif_pipln',Xfrmer_dif_calc1(), lst_diff_cols),
('instpmt_rto_pipln',Xfrmer_rto_calc1(),lst_rto_cols)],
remainder='passthrough')
print('Pipeline fitting start...')
instpmt_preprcs_pipln.fit( dtprcs.X_train, dtprcs.y_train )
print('Pipeline fitting over...')
#print(dtprcs.X_train.shape,dtprcs.x_test.shape)
#print(dtprcs.X_train.columns,dtprcs.x_test.columns)
#Can predict with it like any other pipeline
print('Pipeline transforming x_test...')
y_pred = instpmt_partial_piplin.transform( dtprcs.x_test ) 
print('Pipeline transforming x_test over...')
print(type(dtprcs.X_train),type(dtprcs.x_test),type(dtprcs.y_train))
print(dtprcs.X_train.columns,dtprcs.x_test.columns)
print('Pipeline preprocessing pver. Seting up other classes...')

我的问题

  1. 如何在列转换器中向数据帧添加新列?我尝试使用.loc和不使用.loc。从下面的跟踪中,我们发现该值实际上正在计算中,但没有更新到数据帧中

  2. 调试值在fit((过程中打印,但在测试数据集的转换过程中不打印。

最新堆栈跟踪

Finished reading apln train/test files...
installments_payments.csv
primary name train installments_payments_train.csv
primary name test installments_payments_test.csv
Train test files ready...
finished writing train/test files.
Exiting function(0).
(16915, 8)
(4574, 8)
Processing installments_payments.csv...
Starting pipeline processing
Pipeline fitting start...
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
sign change
sign change
diff caclulator
X columns Index(['AMT_PAYMENT', 'AMT_INSTALMENT'], dtype='object')
0         6948.360
2         6948.360
3         1716.525
4         1716.525
5         3375.000
...    
42390    12303.000
42391    10299.960
42392    10869.435
42402      124.155
42409     4198.950
Name: AMT_PAYMENT, Length: 16915, dtype: float64
0         6948.360
2         6948.360
3         1716.525
4         1716.525
5         3375.000
...    
42390    12303.000
42391    10299.960
42392    14958.135
42402      124.155
42409     4198.950
Name: AMT_INSTALMENT, Length: 16915, dtype: float64
0           0.0
2           0.0
3           0.0
4           0.0
5           0.0
...  
42390       0.0
42391       0.0
42392   -4088.7
42402       0.0
42409       0.0
Name: AMT_PMT_DIF, Length: 16915, dtype: float64
ratio caclulator
Pipeline fitting over...
Pipeline transforming x_test...
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
replaced values
sign change
sign change
diff caclulator
ratio caclulator
**Pipeline transforming x_test over...**
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
'AMT_INSTALMENT', 'AMT_PAYMENT'],
dtype='object') Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
'AMT_INSTALMENT', 'AMT_PAYMENT'],
dtype='object')
Pipeline preprocessing pver. Seting up other classes...
Exiting main function...
E:anacondaenvsappliedaicourselibsite-packagesipykernel_launcher.py:187: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
E:anacondaenvsappliedaicourselibsite-packagespandascoreindexing.py:362: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.obj[key] = _infer_fill_value(value)
E:anacondaenvsappliedaicourselibsite-packagespandascoreindexing.py:562: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.obj[item_labels[indexer[info_axis]]] = value

正如我在评论中所说,我首先使用提取我需要学习的特性(.fit(

from sklearn.base import TransformerMixin
class FeatureExtractor(TransformerMixin):
def __init__(self, cols):
self.cols = cols
print(self.cols)

def fit(self, X, y=None):
# stateless transformer
return self

def transform(self, X):
# assumes X is Pandas Dataframe
X_cols = X.loc[:, self.cols]
return X_cols

然后使用这个类从数据中的一列中学习:

class SynopsisNumWords(TransformerMixin):
def __init__(self):
return None
# self.text_array = text_array

def fit(self,  X, y=None, **fit_params):
return self

def transform(self, X, y=None, **fit_params):
X = X.copy()
# # rename the series to not have the same column name as input
return X.loc[:,'Synopsis'].apply(lambda x: len(str(x).split())).rename('Synopsis_num_words').to_frame()

然后使用以下方法将所有功能合并为一个数据帧:

class DFFeatureUnion(TransformerMixin):
# FeatureUnion but for pandas DataFrames
def __init__(self, transformer_list):
self.transformer_list = transformer_list
def fit(self, X, y=None):
for (name, t) in self.transformer_list:
t.fit(X)
return self
def transform(self, X):
# X must be a DataFrame
Xts = [t.transform(X) for _, t in self.transformer_list]
Xunion = reduce(lambda X1, X2: pd.merge(X1, X2, left_index=True, right_index=True), Xts)
return Xunion

然后将所有这些结合起来,形成一个如下图所示的管道。这个管道获取一个9列的数据帧,从一列中学习,从中生成另一列,然后将所有列合并,并返回10列的数据框架。

from sklearn.pipeline import Pipeline
synopsis_feat_gen_pipeline = Pipeline(steps=[('engineer_data',
DFFeatureUnion([
('extract_all_columns',
Pipeline(steps=[
('extract_all_features',
 FeatureExtractor(['Synopsis', 'Title', 'Author', 'Edition',
                   'Reviews', 'Ratings', 'Genre', 'BookCategory', 'Price'])
 )
], verbose=True
)
),
('generate_num_words_column',
Pipeline(steps=[
('extract_Synopsis_feature', FeatureExtractor(['Synopsis'])),
('generate_num_words', SynopsisNumWords())
], verbose=True
)
),
]))
],
verbose=True)

最新更新