I wrote the following snippet for a nested cross-validation loop, but I'm confused about how to fold SequentialFeatureSelector into the mix, since it has its own CV argument. I think I need to do something similar to the "space = dict()" search-space setup in https://machinelearningmastery.com/nested-cross-validation-for-machine-learning-with-python/ or, better yet, how would I use this together with nested_cv?
# configure the cross-validation procedure
outer_k = 10
inner_k = 10
random_st = sample(list(np.arange(0, 10, 1)), 1)[0]
#print(random_st)
cv_inner = KFold(n_splits=inner_k, shuffle=True, random_state=random_st)
cv_outer = KFold(n_splits=outer_k, shuffle=True, random_state=random_st + 1)

outer_results = []
for outer_train_ix, outer_test_ix in cv_outer.split(X.index):
    inner_results = []
    # inner folds should come from cv_inner and cover only the outer training rows
    for inner_train_ix, inner_test_ix in cv_inner.split(outer_train_ix):
        print("inner_train_ix", inner_train_ix)
        print("inner_test_ix", inner_test_ix)
        #inner_results.append(inner_errors)
    # best_model params are selected from the inner loop above
    # best_model is then fitted to outer_train_ix, and out-of-sample errors are derived from outer_test_ix
    print("outer_train_ix", outer_train_ix)
    print("outer_test_ix", outer_test_ix)
    #outer_results.append(outer_errors)
# the model that performed best on the outer (out-of-sample) forecasts is selected
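For reference, mlxtend's SequentialFeatureSelector takes a cv argument directly, so one way to wire it into the skeleton above is to hand it the inner splitter and let it do the inner scoring itself. A minimal sketch, assuming X, y, cv_inner, cv_outer, and outer_results are defined as above:

# minimal sketch: SFS runs the inner CV, the outer loop measures out-of-sample error
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

for outer_train_ix, outer_test_ix in cv_outer.split(X.index):
    sfs = SFS(LinearRegression(),
              k_features=(1, len(X.columns)),
              forward=True,
              scoring='neg_mean_squared_error',
              cv=cv_inner)                        # inner CV handled by SFS
    sfs.fit(X.iloc[outer_train_ix], y.iloc[outer_train_ix])
    cols = list(sfs.k_feature_names_)             # subset chosen by the inner CV
    model = LinearRegression().fit(X.iloc[outer_train_ix][cols], y.iloc[outer_train_ix])
    outer_results.append(
        np.mean(abs(model.predict(X.iloc[outer_test_ix][cols]).ravel()
                    - y.iloc[outer_test_ix].values.ravel())))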
I believe I've figured it out. My misunderstanding was that the model here is the SFS itself, and its selected features should not be confused with hyperparameters. Nested CV is for comparing models against one another and then choosing the best one; since there is only a single model here, I simply derive its nested-CV error score.
https://gist.github.com/thistleknot/3a46e8a9cba8067ea7061828dbe31e8d
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import sample
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from dask.distributed import Client

data = pd.read_csv(r'C:\Users\User\Documents\wiki\wiki\dev\python\Python-Stock\data\raw\states.csv')

independent = 'Poverty'
y = data[[independent]]
X = data.loc[:, ~data.columns.isin([independent, 'States'])].copy()
# configure the cross-validation procedure
outer_k = 10
inner_k = 10
random_st = sample(list(np.arange(0, 10, 1)), 1)[0]
#print(random_st)
cv_inner = KFold(n_splits=inner_k, shuffle=True, random_state=random_st)
cv_outer = KFold(n_splits=outer_k, shuffle=True, random_state=random_st + 1)

outer_results = []
client = Client('192.168.3.100:8786', timeout=3)

for outer_train_ix, outer_test_ix in cv_outer.split(X.index):
    inner_results = []
    # restrict the inner folds to the outer training rows so the outer test fold stays unseen
    X_train_outer = X.iloc[outer_train_ix]
    y_train_outer = y.iloc[outer_train_ix]
    for inner_train_ix, inner_test_ix in cv_inner.split(outer_train_ix):
        #print("inner_train_ix", inner_train_ix)
        #print("inner_test_ix", inner_test_ix)
        # inner indices are positions within outer_train_ix, so index the outer training subset
        X_train_inner = X_train_outer.iloc[inner_train_ix]
        X_test_inner = X_train_outer.iloc[inner_test_ix]
        y_train_inner = y_train_outer.iloc[inner_train_ix]
        y_test_inner = y_train_outer.iloc[inner_test_ix]
        """
        # define search space (only relevant when tuning hyperparameters, kept for reference)
        space = dict()
        space['n_estimators'] = [10, 100, 500]
        space['max_features'] = [2, 4, 6]
        """
        lm = LinearRegression()
        sfs1 = SFS(estimator=lm,
                   k_features=(1, len(X.columns)),
                   forward=True,
                   floating=False,
                   n_jobs=-1,
                   scoring='neg_mean_squared_error',
                   cv=cv_inner)
        with joblib.parallel_backend('dask'):
            sfs1.fit(X_train_inner, y_train_inner)
        #plt.plot(pd.DataFrame(sfs1.get_metric_dict()).T['avg_score'])
        #plt.show()
        metrics_inner = pd.DataFrame(sfs1.get_metric_dict()).T
        # avg_score is negative MSE, so the value closest to zero marks the best subset
        best_ix = np.argmin(abs(metrics_inner['avg_score']))
        best_features = metrics_inner['feature_names'].iloc[best_ix]
        #print(len(best_features), best_features)
        best_score = metrics_inner['avg_score'].iloc[best_ix]
        #print("best_score", best_score)
        inner_results.append([best_features, best_score])
    # pick the feature subset with the lowest inner-CV error across the inner folds
    inner_df = pd.DataFrame(inner_results)
    best_subset = inner_df.iloc[:, 0][np.argmin(abs(inner_df.iloc[:, 1]))]
    # refit on the outer training fold using only the selected subset,
    # then measure the out-of-sample error on the outer test fold
    outer_model = LinearRegression()
    outer_model.fit(X.iloc[outer_train_ix][list(best_subset)], y.iloc[outer_train_ix])
    y_pred = outer_model.predict(X.iloc[outer_test_ix][list(best_subset)])
    outer_results.append(np.mean(abs(y_pred.ravel() - y.iloc[outer_test_ix].values.ravel())))
    #print("outer_train_ix", outer_train_ix)
    print("outer_test_ix", outer_test_ix)
    print()
print("outer cv abs error mean: ",np.mean(outer_results))
print("outer cv abs error std: ",np.std(outer_results))
print("Final Model")
final_model = LinearRegression()
sfs2 = SFS(estimator=lm,
k_features=(1, len(X.columns)),
forward=True,
floating=False,
n_jobs=-1,
scoring='neg_mean_squared_error',
cv=cv_outer)
with joblib.parallel_backend('dask'):
sfs2.fit(X,y)
plt.plot(pd.DataFrame(sfs1.get_metric_dict()).T['avg_score'])
plt.show()
best_features = pd.DataFrame(sfs2.get_metric_dict()).T['feature_names'].iloc[np.argmin(abs(pd.DataFrame(sfs2.get_metric_dict()).T['avg_score']))]
print(len(best_features),best_features)
best_score = pd.DataFrame(sfs2.get_metric_dict()).T['avg_score'].iloc[np.argmin(abs(pd.DataFrame(sfs2.get_metric_dict()).T['avg_score']))]
print("best_score", best_score)
import statsmodels.api as sm

# add a constant to the selected predictors for the statsmodels summary
x = sm.add_constant(X[list(best_features)])
#fit linear regression model
model = sm.OLS(y, x).fit()
#view model summary
print(model.summary())
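As a side note, the whole outer loop can also be expressed more compactly by wrapping the selector and the regression in a scikit-learn Pipeline and handing that to cross_val_score. A rough sketch under the same data and splitters, assuming the mlxtend selector is used as a pipeline transformer ahead of the final regression (the step names 'sfs' and 'lr' are just placeholders):

# rough sketch: nested CV via Pipeline + cross_val_score
# (SFS does the inner CV inside each outer training fold, cross_val_score does the outer CV)
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

pipe = Pipeline([
    ('sfs', SFS(LinearRegression(),
                k_features=(1, len(X.columns)),
                forward=True,
                scoring='neg_mean_squared_error',
                cv=cv_inner)),
    ('lr', LinearRegression()),
])
outer_scores = cross_val_score(pipe, X, y.values.ravel(),
                               scoring='neg_mean_absolute_error',
                               cv=cv_outer)
print("outer cv abs error mean:", -outer_scores.mean())
print("outer cv abs error std:", outer_scores.std())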