XGBoost:使用DataFrame副本时,标签集不能为空



我目前正在尝试用以下代码为我的原始数据帧"data"的特定数据训练我的XGBoost模型。问题是,当我使用原始数据帧作为连接的左部分时,一切都可以很好地与我的模型配合使用。

join = pd.merge(data, data_shift, left_index=True, right_index=True, how='left', suffixes=('', '_lag'))

但是,当我使用数据帧"location_product_data"作为合并/联接的左侧部分时,拟合会引发上面提到的错误消息。

data = data_raw.copy()
data = data.drop(['Number of day', 'Number of working days'], axis=1)

uniqueLocations = data['SC'].unique().tolist()
data['DATE'] = pd.to_datetime(data['DATE'])
data = data.set_index(pd.DatetimeIndex(data['DATE']))   
year = '2015'
for location in uniqueLocations:
location_data = data.loc[data['SC'] == sc].copy()
uniqueProducts = location_data['SPK'].unique().tolist()
for product in uniqueProducts:
location_product_data = location_data.loc[location_data['SPK'] == product].copy()
#split complete dataset in training and test data
split_date = year+'-10-01'
data_shift = location_product_data.shift(1).copy()
data_shift = data_shift.drop(['SC', 'SPK'], axis=1).copy()
join = pd.merge(location_product_data, data_shift, left_index=True, right_index=True, how='left', suffixes=('', '_lag'))
join['NS AC_lag'] = join['NS AC'] - join['NS AC_lag']
data_train = join.loc[join.index <= split_date].copy()
data_test = join.loc[join.index > split_date].copy()
X_train, y_train = create_features(data_train, label='NS AC')
X_test, y_test = create_features(data_test, label='NS AC')
reg = xgb.XGBRegressor(n_estimators=1000)
params = {
'base_score': np.mean(y_train),
'eta':  0.1,
'max_depth': 3,
'gamma' :3,
'objective'   :'reg:squarederror',
'eval_metric' :'mae'
}
reg.fit(X_train, y_train, 
eval_set=[(X_train, y_train), (X_test, y_test)],
early_stopping_rounds=50,
verbose=True)

def create_features(df, label=None):
"""
Creates time series features from datetime index
"""
df['DATE'] = df.index
df['quarter'] = df['DATE'].dt.quarter
df['month'] = df['DATE'].dt.month
df['year'] = df['DATE'].dt.year
df['lag'] = df['NS AC_lag']
#df['lag2'] = df['NS AC_lag2']
#df['lag3'] = df['NS AC_lag3']
#df['min'] = df['min']
#df['mean'] = df['mean']
#df['max'] = df['max']
#df['t+1'] = df['t+1']
#df['spk'] = df['SPK']
#X = df[['month','quarter','year', 'lag', 'lag2','lag3','min', 'mean', 'max', 't+1']]
X = df[['month','quarter','year', 'lag']]
#X = df[['month','quarter','year', 'Number of day', 'Number of working days','lag', 'lag2','lag3','min', 'mean', 'max', 't+1']]
if label:
y = df[label]
return X, y
return X

你知道为什么会出现错误吗?我还找不到为什么不这样做的原因。谢谢!:(

错误:

--------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
<ipython-input-30-9232809a4751> in <module>
83                     eval_set=[(X_train, y_train), (X_test, y_test)],
84                     early_stopping_rounds=50,
---> 85                     verbose=False)
86 
87             #print('Training finished.')
/opt/conda/lib/python3.7/site-packages/xgboost/sklearn.py in fit(self, X, y, sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, callbacks)
394                               evals_result=evals_result, obj=obj, feval=feval,
395                               verbose_eval=verbose, xgb_model=xgb_model,
--> 396                               callbacks=callbacks)
397 
398         if evals_result:
/opt/conda/lib/python3.7/site-packages/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, learning_rates)
214                            evals=evals,
215                            obj=obj, feval=feval,
--> 216                            xgb_model=xgb_model, callbacks=callbacks)
217 
218 
/opt/conda/lib/python3.7/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
82         # check evaluation result.
83         if evals:
---> 84             bst_eval_set = bst.eval_set(evals, i, feval)
85             if isinstance(bst_eval_set, STRING_TYPES):
86                 msg = bst_eval_set
/opt/conda/lib/python3.7/site-packages/xgboost/core.py in eval_set(self, evals, iteration, feval)
1170                                               dmats, evnames,
1171                                               c_bst_ulong(len(evals)),
-> 1172                                               ctypes.byref(msg)))
1173         res = msg.value.decode()
1174         if feval is not None:
/opt/conda/lib/python3.7/site-packages/xgboost/core.py in _check_call(ret)
174     """
175     if ret != 0:
--> 176         raise XGBoostError(py_str(_LIB.XGBGetLastError()))
177 
178 
XGBoostError: [15:41:05] /workspace/src/metric/elementwise_metric.cu:325: Check failed: info.labels_.Size() != 0U (0 vs. 0) : label set cannot be empty
Stack trace:
[bt] (0) /opt/conda/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x24) [0x7fae2412ecb4]
[bt] (1) /opt/conda/xgboost/libxgboost.so(xgboost::metric::EvalEWiseBase<xgboost::metric::EvalRowRMSE>::Eval(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, bool)+0xfe) [0x7fae243102ee]
[bt] (2) /opt/conda/xgboost/libxgboost.so(xgboost::LearnerImpl::EvalOneIter(int, std::vector<xgboost::DMatrix*, std::allocator<xgboost::DMatrix*> > const&, std::vector<std::string, std::allocator<std::string> > const&)+0x3c9) [0x7fae241c8d99]
[bt] (3) /opt/conda/xgboost/libxgboost.so(XGBoosterEvalOneIter+0x371) [0x7fae2412b651]
[bt] (4) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fae8f98eec0]
[bt] (5) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7fae8f98e87d]
[bt] (6) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7fae8fba4f7e]
[bt] (7) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x139b4) [0x7fae8fba59b4]
[bt] (8) /opt/conda/bin/python(_PyObject_FastCallKeywords+0x49b) [0x557ed8936d2b]     

问题已解决。以下行可能返回一个空的数据帧。我解决了这个问题,现在可以进行试衣/训练了。

data_train = join.loc[join.index <= split_date].copy()
data_test = join.loc[join.index > split_date].copy()

最新更新