OLS Statsmodels公式:将ValueError:零大小的数组返回到没有标识的归约运算最大值



嘿,我正在对几个月内迭代的一些横截面数据进行多个OLS回归。我在第四个月遇到了一个ValueError:零大小的数组到没有标识的最大缩减操作。但我不知道为什么。数据没有NaN,我已经用dropna测试了这一点,因为它是在另一个问题链接中建议的:

df_month.dropna(how='all')

每个月的数据为零是由于数据的标准化,这在前三次迭代中不会造成任何麻烦,只有在第四次迭代中。同样奇怪的是,如果我在第四行执行OLS回归之前停止循环,然后在另一个单元格中手动再次运行回归,它就会正常工作。这个问题可能是由于存储造成的吗?

这是我的代码一个数据复制错误:

import statsmodels.formula.api as smf #ols (minor letters)
import pandas as pd
import numpy as np
df = pd.read_csv('df_all.csv', index_col='Instrument', sep=',', decimal='.')
df.drop(columns='Unnamed: 0', inplace=True)
#Creates an array with all the years to iterate through single year
years = df_all['Date'].dt.year.unique()
df_store = pd.DataFrame(index=[], columns=['year', 'month', 'R2_adj'])
for year in years:
df_year = df_all[df_all['Date'].dt.year == year]
df_year_t1 = df_all[df_all['Date'].dt.year == year+1]
Jan_date = df_year['Date'][0]
year_start = df_year[(df_year['Date'] == Jan_date) & (df_year['HQ'] == 'United States of America')
& (df_year['ESG'] > 0)]
year_start_firms = year_start.index.unique()
df_year_firms = df_year[['Date', 'eTR', 'MC', 'ESG']].loc[year_start_firms]
df_year_t1 = df_year_t1[['Date', 'eTR', 'MC', 'ESG']]

print(year)
print(" ")


#Normalizes the ESG Scores to the interval (0; 1) and substitues it in; 0.5 is the mean
df = df_year_firms[['ESG']]
Normalized_ESG_year = (df - df.min()) / (df.max() - df.min())
df_year_firms_norm =  df_year_firms
df_year_firms_norm[['ESG_norm']] = Normalized_ESG_year.values
df_year_firms_norm = df_year_firms_norm.drop('ESG', axis=1)

df_year_firms_norm = df_year_firms_norm.ffill(axis=0)
df_year_firms_norm.loc[:, 'Month'] = df_year_firms_norm['Date'].dt.month.values.reshape(len(df_year_firms_norm), 1)
df_year_t1.loc[:, 'Month'] = df_year_t1['Date'].dt.month.values.reshape(len(df_year_t1), 1)      
data = pd.merge(df_year_firms_norm, df_year_t1.iloc[:, [0,1,-1]], how='inner', on=['Instrument', 'Month'], suffixes=('_t', '_t1'))
#Resets index
data.reset_index(inplace=True)

#Monthwise iteration within the year loop
for month in range(1, 13, 1):
df_month = data[data['Month'] == month]

#ols by statsmodels.formula.api = smf (ols with minor letters)
#DataFrame input
#smf not sm (statsmodels.formula.api instead of statsmodels.api)
ESG_ols = smf.ols(formula = 'eTR_t1 ~ ESG_norm', data=df_month).fit(cov_type='HAC',cov_kwds={'maxlags':0})

ESG_ols_tstat = ESG_ols.tvalues
ESG_ols_coeff = ESG_ols.params
results_df = pd.DataFrame({
'ESG_ols_coeff':ESG_ols_coeff, 'ESG_ols_tstat':ESG_ols_tstat},                         
index = ['Intercept', 'ESG'])
#Produces the table
df_output = summary_col(
results=[ESG_ols], stars=True, float_format='%0.4f',
model_names=['ESG_ols'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs))}, 
regressor_order = ['Intercept', 'ESG_norm'])
helper = pd.DataFrame(data=[[year, month, ESG_ols.rsquared_adj]], 
columns=['year', 'month', 'R2_adj'])



df_store = df_store.append(helper)
print(month)
break
df_store

我也很高兴在效率方面有任何建议!

根据要求,以下是完整的Traceback:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~AppDataLocalTemp/ipykernel_8408/1348884998.py in <module>
91          #   break
92         #smf not sm (statsmodels.formula.api instead of statsmodels.api)
---> 93         ESG_ols = smf.ols(formula = 'eTR_t1 ~ ESG_norm', data=df_month).fit(cov_type='HAC',cov_kwds={'maxlags':1})
94         #!Assumption: maxlags=0 should be reasonable since we do not have any TS analysis, right?
95         ESG_ols_tstat = ESG_ols.tvalues
~anaconda3libsite-packagesstatsmodelsbasemodel.py in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
193                        'formula': formula,  # attach formula for unpckling
194                        'design_info': design_info})
--> 195         mod = cls(endog, exog, *args, **kwargs)
196         mod.formula = formula
197 
~anaconda3libsite-packagesstatsmodelsregressionlinear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
870     def __init__(self, endog, exog=None, missing='none', hasconst=None,
871                  **kwargs):
--> 872         super(OLS, self).__init__(endog, exog, missing=missing,
873                                   hasconst=hasconst, **kwargs)
874         if "weights" in self._init_keys:
~anaconda3libsite-packagesstatsmodelsregressionlinear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
701         else:
702             weights = weights.squeeze()
--> 703         super(WLS, self).__init__(endog, exog, missing=missing,
704                                   weights=weights, hasconst=hasconst, **kwargs)
705         nobs = self.exog.shape[0]
~anaconda3libsite-packagesstatsmodelsregressionlinear_model.py in __init__(self, endog, exog, **kwargs)
188     """
189     def __init__(self, endog, exog, **kwargs):
--> 190         super(RegressionModel, self).__init__(endog, exog, **kwargs)
191         self._data_attr.extend(['pinv_wexog', 'weights'])
192 
~anaconda3libsite-packagesstatsmodelsbasemodel.py in __init__(self, endog, exog, **kwargs)
235 
236     def __init__(self, endog, exog=None, **kwargs):
--> 237         super(LikelihoodModel, self).__init__(endog, exog, **kwargs)
238         self.initialize()
239 
~anaconda3libsite-packagesstatsmodelsbasemodel.py in __init__(self, endog, exog, **kwargs)
75         missing = kwargs.pop('missing', 'none')
76         hasconst = kwargs.pop('hasconst', None)
---> 77         self.data = self._handle_data(endog, exog, missing, hasconst,
78                                       **kwargs)
79         self.k_constant = self.data.k_constant
~anaconda3libsite-packagesstatsmodelsbasemodel.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
99 
100     def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
--> 101         data = handle_data(endog, exog, missing, hasconst, **kwargs)
102         # kwargs arrays could have changed, easier to just attach here
103         for key in kwargs:
~anaconda3libsite-packagesstatsmodelsbasedata.py in handle_data(endog, exog, missing, hasconst, **kwargs)
670 
671     klass = handle_data_class_factory(endog, exog)
--> 672     return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
673                  **kwargs)
~anaconda3libsite-packagesstatsmodelsbasedata.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
85         self.const_idx = None
86         self.k_constant = 0
---> 87         self._handle_constant(hasconst)
88         self._check_integrity()
89         self._cache = {}
~anaconda3libsite-packagesstatsmodelsbasedata.py in _handle_constant(self, hasconst)
129             # detect where the constant is
130             check_implicit = False
--> 131             exog_max = np.max(self.exog, axis=0)
132             if not np.isfinite(exog_max).all():
133                 raise MissingDataError('exog contains inf or nans')
<__array_function__ internals> in amax(*args, **kwargs)
~anaconda3libsite-packagesnumpycorefromnumeric.py in amax(a, axis, out, keepdims, initial, where)
2731     5
2732     """
-> 2733     return _wrapreduction(a, np.maximum, 'max', axis, None, out,
2734                           keepdims=keepdims, initial=initial, where=where)
2735 
~anaconda3libsite-packagesnumpycorefromnumeric.py in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
85                 return reduction(axis=axis, out=out, **passkwargs)
86 
---> 87     return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
88 
89 
ValueError: zero-size array to reduction operation maximum which has no identity

我不愿意使用这种方法,但它很有效,即使在本应缺乏数据的月份,我仍然会得到结果。我在这里找到了克服空数组的ValueError的想法

try: #Catches the upcoming Value Error
results = smf.ols(formula = 'eTR_t1 ~ ESG_norm', data=df_month).fit(cov_type='HAC',cov_kwds={'maxlags':0})
except ValueError:
pass

感谢@Josef的建议和帮助。

最新更新