## Load the data ##
# Kaggle "House Prices" data: read train/test and stack their feature
# columns (MSSubClass..SaleCondition) so any preprocessing can be applied
# identically to both splits.  The target SalePrice is excluded from the stack.
train=pd.read_csv("../kagglehouse/train.csv")
test=pd.read_csv("../kagglehouse/test.csv")
all_data=pd.concat((train.loc[:,"MSSubClass":"SaleCondition"],test.loc[:,"MSSubClass":"SaleCondition"]))
# Stacking configuration.
NFOLDS = 5   # number of CV folds used to build out-of-fold predictions
SEED = 0     # random_state shared by KFold and every base model
NROWS = None # optional row cap (None = all rows); not used in this chunk
ntrain = train.shape[0]
ntest = test.shape[0]
#creating matrices for sklearn 1:
y_train=train["SalePrice"]
# Split the stacked frame back into train/test design matrices.
# NOTE(review): all_data is converted to arrays raw — if it still contains
# NaNs (unimputed columns), the fits below raise "Input contains NaN";
# confirm imputation/encoding happens before this point.
x_train = np.array(all_data[:train.shape[0]])
x_test = np.array(all_data[train.shape[0]:])
# NOTE(review): KFold(n, n_folds=...) is the pre-0.18 sklearn API
# (sklearn.cross_validation); on modern sklearn this call signature fails and
# get_oof's `for ... in enumerate(kf)` would need kf.split(x_train) instead —
# confirm the installed sklearn version.
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)
class SklearnWrapper(object):
    """Uniform train/predict facade over an sklearn-style estimator class.

    Instantiates ``clf`` with ``params`` plus a fixed ``random_state`` so
    every base model of the stacking ensemble exposes the same interface
    to ``get_oof``.
    """

    def __init__(self, clf, seed=0, params=None):
        # Tolerate params=None (the old code crashed on None['random_state'])
        # and copy so the caller's dict is not mutated as a side effect.
        params = dict(params) if params else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        # BUG FIX: the previous version fitted on stale globals
        # (train_df_munged / label_df) instead of its own arguments, so
        # every fold trained on the wrong data — or raised outright.
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
def get_oof(clf):
    """Produce out-of-fold predictions for one wrapped estimator.

    Returns ``(oof_train, oof_test)`` as (n, 1) column vectors: each
    training row's prediction comes from the fold model that never saw
    that row; the test prediction is the mean of the per-fold models'
    predictions.  Relies on the module-level ``kf``, ``ntrain``,
    ``ntest``, ``NFOLDS``, ``x_train``, ``y_train`` and ``x_test``.
    """
    train_preds = np.zeros((ntrain,))
    test_preds = np.zeros((ntest,))
    fold_test_preds = np.empty((NFOLDS, ntest))

    for fold, (fit_idx, holdout_idx) in enumerate(kf):
        clf.train(x_train[fit_idx], y_train[fit_idx])
        # Predict only the rows this fold's model was not trained on.
        train_preds[holdout_idx] = clf.predict(x_train[holdout_idx])
        # Every fold also scores the full test set; averaged below.
        fold_test_preds[fold, :] = clf.predict(x_test)

    test_preds[:] = fold_test_preds.mean(axis=0)
    return train_preds.reshape(-1, 1), test_preds.reshape(-1, 1)
# Hyper-parameters for each base model of the stacking ensemble.
et_params = {'n_jobs': 16}    # ExtraTrees: fit with 16 parallel workers
rf_params = {'n_jobs': 16}    # RandomForest: fit with 16 parallel workers
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
}
rd_params = {'alpha': 10}     # Ridge regularisation strength
ls_params = {'alpha': 0.005}  # Lasso regularisation strength

# Wrap each estimator class so they all share the train/predict interface.
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)

# Out-of-fold prediction pairs (train meta-features, averaged test preds).
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
rd_oof_train, rd_oof_test = get_oof(rd)
ls_oof_train, ls_oof_test = get_oof(ls)
运行时出现了如下错误,错误信息看起来是这样的:
ValueError                                Traceback (most recent call last)
<ipython-input> in <module>()
    135
    136 xg_oof_train, xg_oof_test = get_oof(xg)
--> 137 et_oof_train, et_oof_test = get_oof(et)
    138 rf_oof_train, rf_oof_test = get_oof(rf)
    139 rd_oof_train, rd_oof_test = get_oof(rd)

<ipython-input> in get_oof(clf)
     77         x_te = x_train[test_index]
     78
---> 79         clf.train(x_tr, y_tr)
     80
     81         oof_train[test_index] = clf.predict(x_te)

<ipython-input> in train(self, x_train, y_train)
     46     def train(self, x_train, y_train):
     47         #self.clf.fit(x_train, y_train)
---> 48         self.clf.fit(x_train, y_train)
     49
     50     def predict(self, x):

e:\graphlab\anaconda2\lib\site-packages\sklearn\ensemble\forest.py in fit(self, X, y, sample_weight)
    245         # Validate or convert input data
    246         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
--> 247         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
    248         if sample_weight is not None:
    249             sample_weight = check_array(sample_weight, ensure_2d=False)

e:\graphlab\anaconda2\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    420                              % (array.ndim, estimator_name))
    421         if force_all_finite:
--> 422             _assert_all_finite(array)
    423
    424     shape_repr = _shape_repr(array.shape)

e:\graphlab\anaconda2\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X)
     41             and not np.isfinite(X).all()):
     42         raise ValueError("Input contains NaN, infinity"
---> 43                          " or a value too large for %r." % X.dtype)
     44

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
而且,当我使用 np.isnan(all_data.all()) 时它返回 False,使用 np.isfinite(all_data.all()) 时它返回 True,所以我很困惑:为什么还会出现这个错误?
您没有正确检查all_data
:
np.isnan( all_data.all() )
np.isfinite( all_data.all() )
这并不是检查数据的正确方式。
您将np.isnan()
和np.isfinite()
应用于all_data.all()
的输出,而 all_data.all() 的返回值始终是一个布尔值 True/False;布尔值必然是有限的、非 NaN 的,所以这两个检查永远不会报告问题。

您应该这样检查您的数据:
np.isfinite( all_data ).all()
np.isnan( all_data ).all()
请注意,all()
应用于np.isfinite()
和np.isnan()
的输出,而不是相反。