当我在 Python 的 sklearn 中执行 GridSearch 或 RandomizedSearch 时,总是出现标题中的错误。但是我检查了我的数据帧 X 和 y,找不到任何 NaN 或 inf。用同一数据集直接训练和测试普通的回归器/模型时,一切正常。所以错误只在我进行参数优化时发生,我不明白为什么。
import numpy as np
import pandas as pd

from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ShuffleSplit
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
# Mean-imputer for NaN cells. NOTE(review): constructed here but never
# applied — the fit_transform calls further down are commented out.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# NOTE(review): max_iter=100 is low for an MLP; expect ConvergenceWarnings.
mlp = MLPRegressor(max_iter=100)
# Hyper-parameter grid for MLPRegressor.
# BUG FIX: (100) is just the int 100, not a 1-tuple — MLPRegressor expects an
# iterable of layer sizes, so the trailing comma in (100,) is required.
par_mlp = {
    # Either a single 100-unit layer or three layers of 50/500 units each.
    'hidden_layer_sizes': [(50, 50, 50), (500, 500, 500), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    # NOTE(review): alpha=1000 is an extreme penalty; combined with
    # solver='sgd', extreme settings can make training diverge and produce
    # inf/NaN predictions — the likely source of the "Input contains NaN,
    # infinity" error seen only during the parameter search.
    'alpha': [0.001, 1000],
    'learning_rate': ['constant', 'adaptive'],
}
# Gradient-boosting regressor with default settings; tuned via par_gb below.
gb=GradientBoostingRegressor()
# Hyper-parameter grid for GradientBoostingRegressor.
# BUG FIX: learning_rate=1000 makes boosting diverge — the staged predictions
# overflow to inf, which raises "Input contains NaN, infinity" during
# cross-validated scoring (a plain fit with default parameters never sees
# such values, which is why it worked outside the search).
par_gb = {
    # NOTE(review): "ls"/"lad" (and criterion "mse"/"mae") were renamed to
    # "squared_error"/"absolute_error" in scikit-learn >= 1.0 — confirm the
    # installed version still accepts the old spellings.
    "loss": ["ls", "lad", "huber", "quantile"],
    # Capped at 1.0: boosting learning rates above ~1 are numerically unstable.
    "learning_rate": [0.001, 0.1, 1.0],
    "n_estimators": [100, 1000],
    "criterion": ["friedman_mse", "mse", "mae"],
}
# Random-forest regressor with default settings; tuned via par_rf below.
rf=RandomForestRegressor()
# Hyper-parameter grid for RandomForestRegressor: split criterion and
# ensemble size.
par_rf = {
    'criterion': ['mse', 'mae'],
    'n_estimators': [100, 1000],
}
# Extra-trees ensemble with default settings; tuned via par_et below.
et=ExtraTreesRegressor()
# Hyper-parameter grid for ExtraTreesRegressor (same knobs as the forest).
par_et = {
    'criterion': ['mae', 'mse'],
    'n_estimators': [100, 1000],
}
# Support-vector regressor with default settings; tuned via par_sv below.
sv=SVR()
# Hyper-parameter grid for SVR.
# NOTE(review): C/gamma/epsilon span only the two extreme endpoints
# 0.001 and 1000 — the search never tries anything in between.
par_sv = {
    "C": [0.001, 1000],
    "gamma": [1000, 0.001],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "epsilon": [0.001, 1000],
    "degree": [1, 6],  # only relevant for the 'poly' kernel
}
# Stochastic-gradient-descent regressor; tuned via par_sgd below.
# NOTE(review): SGDRegressor is very sensitive to eta0/learning_rate choices.
sgd=SGDRegressor()
# Hyper-parameter grid for SGDRegressor.
# BUG FIX: eta0=1000 (especially with learning_rate="constant") makes the
# gradient steps diverge: the coefficients overflow, predictions become
# inf/NaN, and cross-validated scoring then raises "Input contains NaN,
# infinity or a value too large" even though X and y themselves are clean.
par_sgd = {
    # NOTE(review): "squared_loss" was renamed "squared_error" in
    # scikit-learn 1.2 — confirm the installed version accepts it.
    "loss": ["squared_loss", "huber", "epsilon_insensitive",
             "squared_epsilon_insensitive"],
    "penalty": ["l2", "l1", "elasticnet"],
    "alpha": [0.001, 1000],
    "l1_ratio": [0, 1],
    "max_iter": [100, 1000],
    "epsilon": [0.001, 1000],
    "learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
    # Kept at or below 0.1 so constant-rate SGD cannot blow up.
    "eta0": [0.001, 0.01, 0.1],
    "power_t": [0.001, 1000],
}
# Estimators and their parameter grids, index-aligned for the search loop
# below.  NOTE(review): mlp/par_mlp are defined above but never added here,
# so the MLP is not actually searched — confirm that is intended.
regressors=[gb,rf,et,sv,sgd]
parameterlist=[par_gb,par_rf,par_et,par_sv,par_sgd]
#%%
# Accumulators for the best model / parameters / score of each regressor.
bestmodellist = []
bestparameterlist = []
bestscorelist = []

# Standard-scale the features.  fit_transform returns a numpy array, so X is
# wrapped back into a DataFrame afterwards.
X = pd.DataFrame(X)
X = X.reset_index(drop=True)
scaler = StandardScaler()
X = scaler.fit_transform(X)
# NOTE(review): removed the commented-out SimpleImputer passes that targeted
# np.inf/-np.inf — mean imputation cannot neutralise infinities anyway (the
# column mean of data containing inf is itself non-finite).  To clean
# infinities, mask them first, e.g. np.where(np.isfinite(X), X, np.nan),
# then impute the NaNs.
X = pd.DataFrame(X)
#%%
# Candidate scorers; only the first ("r2") is actually passed to the search.
scoring = ["r2", "neg_root_mean_squared_error", "max_error"]
cv = ShuffleSplit(n_splits=3, random_state=0, test_size=0.2)

# Tune each regressor against its index-aligned grid and record the winner.
# (Idiomatic for-loop replaces the manual while/i counter.)
for i, (reg, params) in enumerate(zip(regressors, parameterlist)):
    # error_score=np.nan silences *fit* failures instead of raising, so a
    # grid whose candidates all fail yields best_score_ == nan.  The
    # "Input contains NaN, infinity ..." error, however, is raised while
    # *scoring* diverged models (inf predictions), which error_score does
    # not catch on older scikit-learn versions — fix the grids, not this.
    clf = RandomizedSearchCV(reg, params, n_jobs=1, scoring=scoring[0],
                             cv=cv, n_iter=10, error_score=np.nan)
    clf.fit(X, y)
    bestmodellist.append(clf.best_estimator_)
    bestparameterlist.append(clf.best_params_)
    bestscorelist.append(clf.best_score_)
    print(i)  # progress indicator
Here is the result of the NaN and inf check; the code that produced it is below.
0
0
0
0
TEST
0
0
0
0
0 0.000000
1 0.000000
2 0.000000
3 0.000000
4 0.000000
5 0.000000
6 0.000000
7 0.000000
8 1.437549
9 0.795685
10 0.115410
11 0.815063
12 0.000000
13 0.000000
14 0.000000
15 0.000000
16 1.437549
17 0.000000
18 0.000000
19 0.000000
dtype: float64
1.7976931348623157e+308
(array([], dtype=int64),)
0
0
0
0
TEST
0
0
0
0
23346469.0
1.7976931348623157e+308
(array([], dtype=int64),)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1225 entries, 0 to 1224
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 1225 non-null float64
1 1 1225 non-null float64
2 2 1225 non-null float64
3 3 1225 non-null float64
4 4 1225 non-null float64
5 5 1225 non-null float64
6 6 1225 non-null float64
7 7 1225 non-null float64
8 8 1225 non-null float64
9 9 1225 non-null float64
10 10 1225 non-null float64
11 11 1225 non-null float64
12 12 1225 non-null float64
13 13 1225 non-null float64
14 14 1225 non-null float64
15 15 1225 non-null float64
16 16 1225 non-null float64
17 17 1225 non-null float64
18 18 1225 non-null float64
19 19 1225 non-null float64
dtypes: float64(20)
memory usage: 191.5 KB
None
#%%
# NOTE(review): per the output above, these checks find no NaN/inf in X or y
# (max(y) ~ 2.3e7, far below float64 max) — so the non-finite values that
# crash the search come from diverged model *predictions*, not the data.
def _finiteness_report(values, label):
    """Print NaN/inf diagnostics for *values* and return the check counts.

    *values* is wrapped in a DataFrame so the same column-wise checks work
    for both the feature matrix X and the target y.  Extracted as a helper
    because the original ran the identical block twice, once per array.
    """
    frame = pd.DataFrame(values)
    print(f"==== {label} ====")
    checks = {
        "columns entirely inf": int(np.isinf(frame).all().sum()),
        "columns entirely -inf": int(np.isneginf(frame).all().sum()),
        "columns entirely +inf": int(np.isposinf(frame).all().sum()),
        "columns entirely NaN": int(np.isnan(frame).all().sum()),
        "columns with any NaN": int(np.isnan(frame).any().sum()),
        "columns with any inf": int(np.isinf(frame).any().sum()),
        "columns with any +inf": int(np.isposinf(frame).any().sum()),
        "columns with any -inf": int(np.isneginf(frame).any().sum()),
    }
    for name, count in checks.items():
        print(name, count)
    print("per-column max:", np.max(frame))
    print("float64 max:", np.finfo(np.float64).max)
    # Columns whose maximum reaches the float64 ceiling (overflow suspects).
    print(np.where(np.max(frame) >= np.finfo(np.float64).max))
    return checks

X = pd.DataFrame(X)
_finiteness_report(X, "X")
_finiteness_report(y, "y")
print(X.info())  # DataFrame.info() prints directly and returns None
尝试从参数网格中删除 sgd。如果这样解决了您的问题,其原因在"使用求解器 sgd 时的 MLPRegressor 错误"这一回答中有解释:过大的学习率或 alpha 会使模型发散,产生 inf/NaN 的预测值,从而在交叉验证打分时触发该错误。