我正在 scikit-learn 中构建线性回归模型,并在 scikit-learn 管道(Pipeline)中把对输入的缩放作为预处理步骤。有什么方法可以避免缩放二元(0/1)列吗?目前这些列会和其他所有列一起被缩放,结果它们的值以 0 为中心,而不再是 0 或 1,例如变成 [-0.6, 0.3],这会导致原本为 0 的输入值也影响线性模型的预测。
用于说明的基本代码:
>>> import numpy as np
>>> from sklearn.pipeline import Pipeline
>>> from sklearn.preprocessing import StandardScaler
>>> from sklearn.linear_model import Ridge
>>> X = np.hstack((np.random.random((1000, 2)),
...                np.random.randint(2, size=(1000, 2))))
>>> X
array([[ 0.30314072, 0.22981496, 1. , 1. ],
[ 0.08373292, 0.66170678, 1. , 0. ],
[ 0.76279599, 0.36658793, 1. , 0. ],
...,
[ 0.81517519, 0.40227095, 0. , 0. ],
[ 0.21244587, 0.34141014, 0. , 0. ],
[ 0.2328417 , 0.14119217, 0. , 0. ]])
>>> scaler = StandardScaler()
>>> scaler.fit_transform(X)
array([[-0.67768374, -0.95108883, 1.00803226, 1.03667198],
[-1.43378124, 0.53576375, 1.00803226, -0.96462528],
[ 0.90632643, -0.48022732, 1.00803226, -0.96462528],
...,
[ 1.08682952, -0.35738315, -0.99203175, -0.96462528],
[-0.99022572, -0.56690563, -0.99203175, -0.96462528],
[-0.91994001, -1.25618613, -0.99203175, -0.96462528]])
我希望最后一行的输出是:
>>> scaler.fit_transform(X, dont_scale_binary_or_something=True)
array([[-0.67768374, -0.95108883, 1. , 1. ],
[-1.43378124, 0.53576375, 1. , 0. ],
[ 0.90632643, -0.48022732, 1. , 0. ],
...,
[ 1.08682952, -0.35738315, 0. , 0. ],
[-0.99022572, -0.56690563, 0. , 0. ],
[-0.91994001, -1.25618613, 0. , 0. ]])
有什么办法可以做到这一点吗?我想我可以只选出非二元的列,只转换这些列,再把转换后的值写回数组,但我希望它能很好地配合 scikit-learn 的 Pipeline 工作流程,这样我就可以写出类似下面的代码:
# Desired (hypothetical) usage from the question: `dont_scale_binary_features`
# is the option the asker wishes existed; it is not a real StandardScaler
# parameter, so this snippet illustrates intent only.
# NOTE(review): `y` is not defined anywhere in the visible snippet.
clf = Pipeline([('scaler', StandardScaler()), ('ridge', Ridge())])
clf.set_params(scaler__dont_scale_binary_features=True, ridge__alpha=0.04).fit(X, y)
您应该创建一个自定义缩放器,该缩放器在缩放时忽略最后两列。
from sklearn.base import TransformerMixin
import numpy as np
class CustomScaler(TransformerMixin):
    """Standardize every column except the last two.

    The wrapped ``StandardScaler`` is fitted on, and applied to, all columns
    but the trailing two; those two columns are appended to the result
    unchanged, so the column order of the input is preserved.
    """

    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the wrapped scaler on all columns but the last two.

        ``y`` is optional so the transformer can be fitted without targets,
        e.g. through ``fit_transform(X)``.
        """
        self.scaler.fit(X[:, :-2], y)
        return self

    def transform(self, X):
        """Return ``X`` with the leading columns scaled and the last two as-is."""
        X_head = self.scaler.transform(X[:, :-2])
        # np.concatenate expects ONE sequence of arrays as its first argument;
        # passing the arrays as separate positionals raises a TypeError.
        return np.concatenate((X_head, X[:, -2:]), axis=1)
我把根据 @miindlek 的回答改编的代码发布在这里,以防对其他人有帮助。当我没有继承 BaseEstimator 时,遇到了一个错误。再次感谢 @miindlek。下面的 bin_vars_index 是二元变量的列索引数组,cont_vars_index 是要缩放的连续变量的列索引数组。
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize the continuous columns and leave the binary columns alone.

    Parameters
    ----------
    bin_vars_index : column indices of the binary variables (passed through).
    cont_vars_index : column indices of the continuous variables (scaled).
    copy, with_mean, with_std : forwarded to the wrapped ``StandardScaler``.

    Note: ``transform`` returns the feature matrix with the binary columns
    ordered first, followed by the scaled continuous columns.
    """

    def __init__(self, bin_vars_index, cont_vars_index, copy=True,
                 with_mean=True, with_std=True):
        self.bin_vars_index = bin_vars_index
        self.cont_vars_index = cont_vars_index
        # Every constructor argument is stored under its own name so that
        # BaseEstimator.get_params / set_params / clone can find it.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's options are passed by keyword; positional use is
        # rejected by current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean,
                                     with_std=with_std)

    def fit(self, X, y=None):
        """Fit the wrapped scaler on the continuous columns only."""
        # Re-sync in case set_params() changed the options after construction.
        self.scaler.set_params(copy=self.copy, with_mean=self.with_mean,
                               with_std=self.with_std)
        self.scaler.fit(X[:, self.cont_vars_index], y)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``[binary columns | scaled continuous columns]``.

        ``y`` is accepted for signature compatibility and ignored; it is not
        forwarded because ``StandardScaler.transform`` would read a second
        positional argument as ``copy``.
        """
        X_tail = self.scaler.transform(X[:, self.cont_vars_index], copy=copy)
        return np.concatenate((X[:, self.bin_vars_index], X_tail), axis=1)
管道应更改为:
from sklearn.preprocessing import StandardScaler,FunctionTransformer
from sklearn.pipeline import Pipeline,FeatureUnion
def _select_categorical(data):
    """Return the columns of ``data`` listed in ``cat_indices``, unscaled."""
    # presumably `cat_indices` is a caller-defined array of column indices
    return data[:, cat_indices]


def _select_numeric(data):
    """Return the columns of ``data`` listed in ``num_indices`` for scaling."""
    # presumably `num_indices` is a caller-defined array of column indices
    return data[:, num_indices]


# Named functions are used instead of lambdas: a FunctionTransformer wrapping
# a lambda cannot be pickled, which prevents persisting the fitted pipeline.
pipeline = Pipeline(steps=[
    ('feature_processing', FeatureUnion(transformer_list=[
        # categorical/binary columns: selected and passed through as-is
        ('categorical', FunctionTransformer(_select_categorical)),
        # numeric columns: selected, then standardized
        ('numeric', Pipeline(steps=[
            ('select', FunctionTransformer(_select_numeric)),
            ('scale', StandardScaler()),
        ])),
    ])),
    ('clf', Ridge()),
])
我对 @J_C 的代码做了一些调整,使其可以用于 pandas DataFrame。您可以传入要缩放的列名,并得到保持原始列顺序的结果。
enter code here
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize selected columns of a pandas DataFrame.

    Parameters
    ----------
    columns : list of column names to scale; all other columns pass through.
    copy, with_mean, with_std : forwarded to the wrapped ``StandardScaler``.

    ``transform`` returns a DataFrame with the original column order and the
    original index.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        # Every constructor argument is stored under its own name so that
        # BaseEstimator.get_params / set_params / clone can find it.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's options are passed by keyword; positional use is
        # rejected by current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean,
                                     with_std=with_std)

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]``."""
        # Re-sync in case set_params() changed the options after construction.
        self.scaler.set_params(copy=self.copy, with_mean=self.with_mean,
                               with_std=self.with_std)
        self.scaler.fit(X[self.columns], y)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized, order preserved.

        ``y`` is accepted for signature compatibility and ignored.
        """
        init_col_order = X.columns
        # Reuse X's index: pd.concat aligns on index labels, so a fresh
        # RangeIndex would misalign rows (and introduce NaNs) whenever X does
        # not carry the default 0..n-1 index.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns], copy=copy),
            columns=self.columns,
            index=X.index,
        )
        # `.ix` has been removed from pandas; `.loc` accepts the boolean mask.
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
用法:
# Scale only the two named columns of the `churn_d` DataFrame; fit and
# transform are chained explicitly on the same data.
scale = CustomScaler(columns=['duration', 'num_operations'])
scaled = scale.fit(churn_d).transform(churn_d)
我发现 @Vitaliy Grabovets 的 DataFrame 版本中的拼接(concat)无法正常工作,除非为 X_scaled 指定索引。所以相关的那一行现在改为:
X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns, index=X.index)
下面这种做法可能更简单:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# StandardScaler is used below but was missing from this snippet's imports.
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Example data: two continuous columns in [0, 1) followed by two 0/1 columns.
X = np.hstack((np.random.random((1000, 2)), np.random.randint(2, size=(1000, 2))))
df = pd.DataFrame(X, columns=["num_1", "num_2", "binary_1", "binary_2"])

num_attribs = ["num_1", "num_2"]
binary_attribs = ["binary_1", "binary_2"]

# Only the continuous columns go through the scaler.
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

# ColumnTransformer routes each named column group to its own transformer and
# concatenates the results in the order listed here.
full_pipeline = ColumnTransformer([
    ("num_cols", num_pipeline, num_attribs),
    # drop="first" keeps a single indicator column per binary feature.
    ("binary_cols", OneHotEncoder(drop="first"), binary_attribs),
])
full_pipeline.fit_transform(df)