随机搜索值错误:输入包含 NaN、无穷大或对于 dtype('float64')来说太大的值。但数据是正确的



当我在Python的sklearn中执行Gridsearch或RandomizedSearch时,标题中的错误总是会发生的。 但是我检查了我的数据帧 X 和 y,找不到任何 nans 或 inf。当使用数据集来训练测试正常的回归器/模式时,它可以毫无问题地工作。所以它只发生在我进行参数优化时。但我不明白为什么。

from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
mlp = MLPRegressor(max_iter=100)
par_mlp = {
'hidden_layer_sizes': [(50,50,50), (500,500,500), (100)],
'activation': ['tanh', 'relu'],
'solver': ['sgd','adam'],
'alpha': [0.001,1000],
'learning_rate': ['constant','adaptive']
}
gb=GradientBoostingRegressor()
par_gb={
"loss":["ls", "lad","huber","quantile"],
"learning_rate":[0.001,1000],
"n_estimators":[100,1000],
"criterion": ["friedman_mse", "mse", "mae"],

}
rf=RandomForestRegressor()
par_rf={'criterion':['mse','mae'],
'n_estimators': [100,1000]
}
et=ExtraTreesRegressor()
par_et={'criterion':['mae','mse'],
'n_estimators': [100,1000]
}
sv=SVR()
par_sv={'C': [0.001,1000],
'gamma': [1000,0.001],
'kernel': ["linear", "poly", "rbf", "sigmoid"],
"epsilon":[0.001,1000],
"degree":[1,6]
}
sgd=SGDRegressor()
par_sgd={"loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
"penalty":["l2","l1","elasticnet"],
"alpha":[0.001,1000],
"l1_ratio":[0,1],
"max_iter":[100,1000],
"epsilon":[0.001,1000],
"learning_rate":["constant","optimal","invscaling","adaptive"],
"eta0":[0.001,1000],
"power_t":[0.001,1000]

}

regressors=[gb,rf,et,sv,sgd]
parameterlist=[par_gb,par_rf,par_et,par_sv,par_sgd]
#%%
bestmodellist=[]
bestparameterlist=[]
bestscorelist=[]
X=pd.DataFrame(X)
X=X.reset_index(drop=True)
scaler=StandardScaler()
X=scaler.fit_transform(X)
# imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# X=imp.fit_transform(X)
# imp = SimpleImputer(missing_values=np.inf, strategy='mean')
# X=imp.fit_transform(X)
# imp = SimpleImputer(missing_values=-np.inf, strategy='mean')
# X=imp.fit_transform(X)
X=pd.DataFrame(X)
#%%
i=0
scoring=["r2","neg_root_mean_squared_error","max_error"]
cv=ShuffleSplit(n_splits=3, random_state=0, test_size=0.2)
while i<len(parameterlist):
clf = RandomizedSearchCV(regressors[i], parameterlist[i], n_jobs=1,scoring=scoring[0], cv=cv,n_iter=10,error_score=np.nan)
#clf = GridSearchCV(regressors[i], parameterlist[i], n_jobs=-1, cv=cv)
clf.fit(X, y)
bestmodellist.append(clf.best_estimator_)
bestparameterlist.append(clf.best_params_)
bestscorelist.append(clf.best_score_)
print(i)
i=i+1

Here is the result for the nan and inf check. the code is below.
0
0
0
0
TEST
0
0
0
0
0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.000000
6     0.000000
7     0.000000
8     1.437549
9     0.795685
10    0.115410
11    0.815063
12    0.000000
13    0.000000
14    0.000000
15    0.000000
16    1.437549
17    0.000000
18    0.000000
19    0.000000
dtype: float64
1.7976931348623157e+308
(array([], dtype=int64),)
0
0
0
0
TEST
0
0
0
0
23346469.0
1.7976931348623157e+308
(array([], dtype=int64),)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1225 entries, 0 to 1224
Data columns (total 20 columns):
#   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
0   0       1225 non-null   float64
1   1       1225 non-null   float64
2   2       1225 non-null   float64
3   3       1225 non-null   float64
4   4       1225 non-null   float64
5   5       1225 non-null   float64
6   6       1225 non-null   float64
7   7       1225 non-null   float64
8   8       1225 non-null   float64
9   9       1225 non-null   float64
10  10      1225 non-null   float64
11  11      1225 non-null   float64
12  12      1225 non-null   float64
13  13      1225 non-null   float64
14  14      1225 non-null   float64
15  15      1225 non-null   float64
16  16      1225 non-null   float64
17  17      1225 non-null   float64
18  18      1225 non-null   float64
19  19      1225 non-null   float64
dtypes: float64(20)
memory usage: 191.5 KB
None
#%%
#print(X.isnull().sum().sum())
#print(y.isnull().sum().sum())
X=pd.DataFrame(X)
print(np.isinf(X).all().sum())
print(np.isneginf(X).all().sum())
print(np.isposinf(X).all().sum())
print(np.isnan(X).all().sum())
print("TEST")
print(np.isnan(X).any().sum())
print(np.isinf(X).any().sum())
print(np.isposinf(X).any().sum())
print(np.isneginf(X).any().sum())
print(np.max(X))
print(np.finfo(np.float64).max)
print(np.where(np.max(X)>=np.finfo(np.float64).max))
print(np.isinf(y).all().sum())
print(np.isneginf(y).all().sum())
print(np.isposinf(y).all().sum())
print(np.isnan(y).all().sum())
print("TEST")
print(np.isnan(y).any().sum())
print(np.isinf(y).any().sum())
print(np.isposinf(y).any().sum())
print(np.isneginf(y).any().sum())
print(np.max(y))
print(np.finfo(np.float64).max)
print(np.where(np.max(y)>=np.finfo(np.float64).max))
print(X.info())

尝试从网格中删除 sgd。如果这解决了您的问题,则在此处解释 使用求解器 sgd 时的 MLPRegressor 错误

相关内容

最新更新