我正试图从pandas数据读取器库下载股价,并根据我提供的股票代码计算(每日、每周、每月等(回报。
下载数据后,我对这些数据的分布执行kstest,并根据提供的p值评估它是否类似于双正态分布(两个正态分布的和(。
由于我只为这个分布执行一个kstest,我想利用Python中的"最小化"库最大化p值(最小化-p值(,改变这两个分布的平均值、标准差和权重。
import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.optimize import minimize
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt
from pandas_datareader import data
import time
import xlwt
import matplotlib.ticker as mtick
from sklearn import datasets
def Puxa_Preco(ticker,start_date,end_date,lag):
dados= data.get_data_yahoo(ticker, start_date, end_date )
from sklearn import datasets
data_set = np.log(dados['Close'])-np.log(dados['Close'] .shift(lag))
data_set = data_set.fillna(method='ffill')
data_set = data_set.dropna()
y = pd.DataFrame()
y=data_set
x = np.arange(len(y))
size = len(y)
print(y)
return y
def mixnormal_cdf(distribuicao, weight1, mean1, stdv1,weight2, mean2, stdv2):
"""
CDF of a mixture of two normal distributions.
"""
return (weight1*st.norm.cdf(distribuicao, mean1, stdv1) +
weight2*st.norm.cdf(distribuicao, mean2, stdv2))
def Objetivo(X,distribuicao):
peso_dist_1 = X[0]
mi1 = X[1]
sigma1 = X[2]
peso_dist_2 = 1-X[0]
mi2 = X[3]
sigma2 = X[4]
stat2, pvalue = st.kstest(distribuicao, cdf=mixnormal_cdf,
args=(peso_dist_1, mi1, sigma1,peso_dist_2, mi2, sigma2))
''' Kolmogorov-Smirnov Test, to test whether or not the data is from a given distribution. The
returned p-value indicates the probability that the data is from the given distribution,
i.e. a low p-value means the data are likely not from the tested distribution.
Note that, for this test, it is necessary to specify shape, location, and scale parameters,
to obtain meaningful results (c,loc,scale).
stat2: the test statistic, e.g. the max distance between the
cumulated distributions '''
return -pvalue
ticker = 'PETR4.SA'
start_date = '2010-01-02' #yyyy-mm-dd
end_date = '2015-01-02'
for lag in range(1,503):
distribuicao = Puxa_Preco(ticker,start_date,end_date,lag)
n = len(distribuicao)
ChuteInicial=[0.3,0.0010,0.0010,-0.0030,0.0830] #peso_dist_1, mi1, sigma1, mi2, sigma2
test = [0.2,0.0020,0.0110,0.8,-0.0020,0.0230]
Limites = ([0,1],[-50,+50],[0,+50],[0,1],[-50,+50],[0,+50]) #peso_dist_1, mi1, sigma1, peso_dist_2,mi2, sigma2
print("------------------------------------------------------------------------------------------------")
print("Validation Test:")
print(-Objetivo(test,distribuicao)) #the value should be around -0.90 to verify that the objective function it is ok
solution = minimize(fun=Objetivo,x0=ChuteInicial,args=distribuicao,method='SLSQP',bounds = Limites) #minimize - p-valor
print("------------------------------------------------------------------------------------------------")
print("solution:")
print(solution)
找到以下解决方案:
fun: -8.098252265651002e-53
jac: array([-2.13080032e-35, 0.00000000e+00, 0.00000000e+00, -1.93307671e-34, 7.91878934e-35])
message: 'Optimization terminated successfully.'
nfev: 8
nit: 1
njev: 1
status: 6
success: True
x: array([ 0.3 , 0.001, 0.001, -0.003, 0.083])
但我知道正确的答案应该是(测试(:[0.2,0.0020,0.0110,0.8,-0.0020,0.0230]产生0.90 的p值
在我看来,它只运行了几次模拟,因为它没有改变p值,所以它停止了。
是否有一种方法可以确保只有在发现p值大于0.9后,"最小化"才会停止?有人能帮帮我吗
我尝试使用考虑Nelder-Mead的最小化方法,看起来更准确,但甚至不接近应该是答案的0.9 p值,我不知道Nelder-米德方法是否考虑了我提供的极限。
#solution = minimize(fun=Objetivo,x0=(ChuteInicial),args=distribuicao,method='Nelder-Mead',bounds = Limites,options={'int':1000000})
通过能够最小化k-s统计,而不是p值,以及在定义cdf函数时的其他修改,我认为我能够估计参数。这是我的代码和优化的参数估计。我从这篇论文中得到了最小化k-s统计的想法(https://www.researchgate.net/publication/250298633_Minimum_Kolmogorov-Smirnov_testrongtatistic_parameter_estimates)
import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.optimize import minimize
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt
from pandas_datareader import data
import time
import xlwt
import matplotlib.ticker as mtick
from sklearn import datasets
def Puxa_Preco(ticker,start_date,end_date,lag):
dados= data.get_data_yahoo(ticker, start_date, end_date )
data_set = np.log(dados['Close'])-np.log(dados['Close'] .shift(lag))
data_set = data_set.fillna(method='ffill')
data_set = data_set.dropna()
y = pd.DataFrame()
y=data_set
x = np.arange(len(y))
size = len(y)
return y
def mixnormal_cdf(distribuicao, weight1, mean1, stdv1,mean2, stdv2):
## CDF of a mixture of two normal distributions.
return (weight1*st.norm.cdf(distribuicao, mean1, stdv1) +
(1-weight1)*st.norm.cdf(distribuicao, mean2, stdv2))
def Objetivo(X, distribuicao):
stat2, pvalue = st.kstest(distribuicao, cdf=mixnormal_cdf, args=X)
return stat2
ticker = 'PETR4.SA'
start_date = '2010-01-02' #yyyy-mm-dd
end_date = '2015-01-02'
for lag in range(1,10):
distribuicao = Puxa_Preco(ticker,start_date,end_date,lag)
ChuteInicial=[0.2,0.0020,0.0110,-0.0020,0.0230] #peso_dist_1, mi1, sigma1, mi2, sigma2
Limites = ([0,1],[-0.1,+0.1],[0,None],[-0.1,0.1],[0,None]) #peso_dist_1, mi1, sigma1, peso_dist_2,mi2, sigma2
print("------------------------------------------------------------------------------------------------")
print("Intial Guess")
print(" weight1: {:.4f}, mean1: {:.4f}, stdv1: {:.4f}, mean2: {:.4f}, stdv2: {:.4f}".format(*ChuteInicial))
ks_stat, p_value = st.kstest(distribuicao, cdf=mixnormal_cdf, args=ChuteInicial)
print("k-s stat: {}, p-value: {}".format(ks_stat, p_value))
print("------------------------------------------------------------------------------------------------")
solution = minimize(fun=Objetivo,x0=ChuteInicial,args=(distribuicao), method='SLSQP',bounds = Limites, options={'ftol':1e-12})
print("Optimized Solution:")
print("------------------------------------------------------------------------------------------------")
print(solution.message)
ks_stat, p_value = st.kstest(distribuicao, cdf=mixnormal_cdf, args=solution.x)
print("Optimized k-s stat: {}, p-value: {}".format(ks_stat, p_value))
print("Optimized weight1: {:.4f}, mean1: {:.4f}, stdv1: {:.4f}, mean2: {:.4f}, stdv2: {:.4f}".format(*solution.x))
print("------------------------------------------------------------------------------------------------")
导致
LAG: 1
------------------------------------------------------------------------------------------------
Intial Guess
weight1: 0.2000, mean1: 0.0020, stdv1: 0.0110, mean2: -0.0020, stdv2: 0.0230
k-s stat: 0.016419395777755863, p-value: 0.9027260234690881
------------------------------------------------------------------------------------------------
Optimized Solution:
------------------------------------------------------------------------------------------------
Optimization terminated successfully.
Optimized k-s stat: 0.014896801186217778, p-value: 0.952748910069967
Optimized weight1: 0.2016, mean1: 0.0016, stdv1: 0.0108, mean2: -0.0017, stdv2: 0.0227
------------------------------------------------------------------------------------------------
LAG: 2
------------------------------------------------------------------------------------------------
Intial Guess
weight1: 0.2000, mean1: 0.0020, stdv1: 0.0110, mean2: -0.0020, stdv2: 0.0230
k-s stat: 0.0791846286223467, p-value: 5.496852766236095e-07
------------------------------------------------------------------------------------------------
Optimized Solution:
------------------------------------------------------------------------------------------------
Optimization terminated successfully.
Optimized k-s stat: 0.013690695822088705, p-value: 0.978164746056248
Optimized weight1: 0.2291, mean1: -0.0047, stdv1: 0.0125, mean2: -0.0012, stdv2: 0.0351
------------------------------------------------------------------------------------------------
LAG: 3
------------------------------------------------------------------------------------------------
Intial Guess
weight1: 0.2000, mean1: 0.0020, stdv1: 0.0110, mean2: -0.0020, stdv2: 0.0230
k-s stat: 0.13436209329730314, p-value: 2.533666062868497e-19
------------------------------------------------------------------------------------------------
Optimized Solution:
------------------------------------------------------------------------------------------------
Optimization terminated successfully.
Optimized k-s stat: 0.01622740259609401, p-value: 0.9106251238446841
Optimized weight1: 0.2638, mean1: -0.0058, stdv1: 0.0199, mean2: -0.0020, stdv2: 0.0433
------------------------------------------------------------------------------------------------
我找到了一种方法来确保这一点,但这不是最智能的方法。。。
for lag in range(1,10):
distribuicao = Puxa_Preco(ticker,start_date,end_date,lag)
delta1 = 0
delta2 = 0
delta3 = 0
for j in range(1,101):
ChuteInicial=[0.05+delta1,0.0010+delta2,0.0010+delta3,-0.0030+delta2,0.0830+delta3] #peso_dist_1, mi1, sigma1, mi2, sigma2
Limites = ([0,1],[-0.5,+0.5],[0,None],[-0.5,0.5],[0,None]) #peso_dist_1, mi1, sigma1, peso_dist_2,mi2, sigma2
print("------------------------------------------------------------------------------------------------")
print("------------------------------------Lag " + str(lag) + " -------------------------------------")
print("------------------------------------------------------------------------------------------------")
print("Intial Guess")
print(" weight1: {:.4f}, mean1: {:.4f}, stdv1: {:.4f}, mean2: {:.4f}, stdv2: {:.4f}".format(*ChuteInicial))
ks_stat, p_value = st.kstest(distribuicao, cdf=mixnormal_cdf, args=ChuteInicial)
print("k-s stat: {}, p-value: {}".format(ks_stat, p_value))
print("------------------------------------------------------------------------------------------------")
solution = minimize(fun=Objetivo,x0=ChuteInicial,args=(distribuicao), method='SLSQP',bounds = Limites) #minimize - p-valor
print("Optimized Solution:")
print("------------------------------------------------------------------------------------------------")
print(solution)
ks_stat, p_value = st.kstest(distribuicao, cdf=mixnormal_cdf, args=solution.x)
print("Optimized k-s stat: {}, p-value: {}".format(ks_stat, p_value))
print("Optimized weight1: {:.4f}, mean1: {:.4f}, stdv1: {:.4f}, mean2: {:.4f}, stdv2: {:.4f}".format(*solution.x))
print("------------------------------------------------------------------------------------------------")
if ((p_value <= 0.9) or (ks_stat >= 0.02)):
delta1 += 0.005
delta2 += 0.001
delta3 += 0.001
else:
break