我有以下函数:
def calculateEMAs(df, startIndex, endIndex):
    """Fill the columns "EMA1" .. "EMA50" of ``df`` in place, then print it.

    For every row label ``index`` and every lag from 1 to 50 for which
    ``index - lag > 0``, the cell ``df.loc[index, "EMA<lag>"]`` is set to
    half the absolute difference between the "Trade Close" values found at
    positions ``index - lag`` and ``index``.

    ``startIndex`` and ``endIndex`` are accepted but not used by this
    version; the lag range is fixed. Nothing is returned.

    NOTE(review): the row label is also used as a position for ``df.iloc``,
    so this presumably expects a 0, 1, 2, ... index - confirm with callers.
    """
    for index, _ in df.iterrows():
        for lag in range(1, 51):
            if index - lag > 0:
                earlier_close = df.iloc[index - lag]["Trade Close"]
                current_close = df.iloc[index]["Trade Close"]
                # replace this with EMA formula
                df.loc[index, "EMA%d" % lag] = abs(earlier_close - current_close) / 2
    print(df)
这个循环计算数据帧的值需要很长时间,因为每一行都要循环 50 次(大约需要 62 秒)。
我尝试按照这个问题中的做法使用多进程池(multiprocessing Pool)。我的代码现在看起来是这样的:
def calculateEMAs(df, startIndex, endIndex):
    """Fill "EMA<lag>" columns of ``df`` in place for a range of lags, then print it.

    The lags run over ``range(startIndex, endIndex)``. For every row label
    ``index`` and every lag with ``index - lag > 0``, the cell
    ``df.loc[index, "EMA<lag>"]`` receives half the absolute difference
    between the "Trade Close" values at positions ``index - lag`` and
    ``index``. Nothing is returned; the frame is only mutated and printed.

    NOTE(review): the row label is also used as a position for ``df.iloc``,
    so this presumably expects a 0, 1, 2, ... index - confirm with callers.
    """
    for index, _ in df.iterrows():
        for lag in range(startIndex, endIndex):
            if index - lag > 0:
                previous = df.iloc[index - lag]["Trade Close"]
                latest = df.iloc[index]["Trade Close"]
                # replace this with EMA formula
                df.loc[index, "EMA%d" % lag] = abs(previous - latest) / 2
    print(df)
def main():
    """Run calculateEMAs over five lag ranges through a process pool and time it.

    The close-price frame is loaded once, five ``apply_async`` calls are
    submitted (lags 1-9, 10-19, 20-29, 30-39 and 40-50), and every result is
    waited for in submission order. The elapsed wall-clock time and the
    frame held by this process are then printed.

    NOTE: the values returned by ``get()`` are discarded; only the local
    ``dfClosePrice`` is printed at the end.
    """
    dfClosePrice = getFileDataframe().to_frame()
    pool = Pool()
    time0 = time.time()

    # (start, end) lag bounds handed to each asynchronous call; end is exclusive.
    lag_bounds = [(1, 10), (10, 20), (20, 30), (30, 40), (40, 51)]
    pending = [
        pool.apply_async(calculateEMAs, [dfClosePrice, first, last])
        for first, last in lag_bounds
    ]

    # Block until each task has finished, in the order it was submitted.
    for task in pending:
        task.get()

    print(time.time() - time0)
    print(dfClosePrice)
我用不同的 for 循环范围异步运行该函数。这样只需 19 秒即可完成,而且可以看到每个函数调用都正确打印了结果,但 dfClosePrice
的最终值仍是一个只有 1 列(Trade Close)的数据帧,各个异步函数新增的列并没有添加到该数据帧中。正确的做法应该是什么?
使用Numpy矢量化的解决方案
问题
- 行
if(index-i > 0):
应该是 if(index-i >= 0):,否则会漏掉 index-i 等于 0 时(即与第 0 行相比)的差值
- 使用 "Close" 而不是 "Trade Close"(对性能无影响,只是避免从网络获取数据后再重命名列)
代码
import numpy as np
import pandas as pd
def compute_using_np(df, start_index, end_index):
    """Return ``df`` joined with lagged half-absolute-difference columns.

    For each offset ``k`` in ``range(start_index, end_index)`` a column
    ``"EMA<k>"`` is produced whose value at row position ``r`` (``r >= k``)
    is ``abs(Close[r] - Close[r - k]) / 2``; the first ``k`` rows stay NaN.
    Each column is computed from whole NumPy slices instead of row by row.

    Args:
        df: DataFrame with a "Close" column. It is not modified.
            NOTE(review): a duplicated index would multiply rows in the
            final join - presumably the index is unique; confirm.
        start_index: First offset (inclusive). Offset 0 is not supported
            because ``values[:-0]`` is an empty slice.
        end_index: Last offset (exclusive).

    Returns:
        A new DataFrame holding ``df`` plus one "EMA<k>" column per offset.
    """
    nrows = len(df)
    ncols = end_index - start_index

    # Container for the pairwise differences; cells without an earlier row stay NaN.
    pair_wise_diff = np.full((nrows, ncols), np.nan)

    # "Close" column as a 1-D NumPy array.
    values = df['Close'].values

    # One vectorized subtraction per offset.
    for offset in range(start_index, end_index):
        diff = np.abs(values[offset:] - values[:-offset]) / 2.0
        pair_wise_diff[offset:, offset - start_index] = diff

    columns = ["EMA%d" % i for i in range(start_index, end_index)]
    # Reuse df's own index so the join lines rows up whatever labels df carries.
    df_result = pd.DataFrame(data=pair_wise_diff, index=df.index, columns=columns)

    # Add the result to df, merging on the index.
    return df.join(df_result)
用法
df_result = compute_using_np(df, 1, 51)
性能
摘要
- 原帖代码:每个循环 37.9 s ± 143 ms(7 次运行的平均值 ± 标准偏差,每次 1 个循环)
- Numpy 代码:每个循环 1.56 ms ± 27.2 µs(7 次运行的平均值 ± 标准偏差,每次 1000 个循环)
- 结果:加速约 2 万倍
测试代码
import pandas_datareader as dr
import pandas as pd
import numpy as np
def calculateEMAs(df, start_index, end_index):
    """Row-by-row baseline: fill "EMA<lag>" columns of ``df`` in place.

    This is the posted code with PEP 8 parameter names and the corrected
    ``>= 0`` condition. For every row label ``index`` and every lag in
    ``range(start_index, end_index)`` with ``index - lag >= 0``, the cell
    ``df.loc[index, "EMA<lag>"]`` is set to half the absolute difference
    between the "Close" values at positions ``index - lag`` and ``index``.

    Returns:
        The same ``df`` object that was passed in, after mutation.
    """
    for index, _ in df.iterrows():
        for lag in range(start_index, end_index):
            if index - lag >= 0:
                earlier_close = df.iloc[index - lag]["Close"]
                current_close = df.iloc[index]["Close"]
                # replace this with EMA formula
                df.loc[index, "EMA%d" % lag] = abs(earlier_close - current_close) / 2
    return df
def compute_using_np(df, start_index, end_index):
    """Return ``df`` joined with lagged half-absolute-difference columns.

    Uses NumPy to vectorize the computation. For each offset ``k`` in
    ``range(start_index, end_index)`` a column ``"EMA<k>"`` is produced
    whose value at row position ``r`` (``r >= k``) is
    ``abs(Close[r] - Close[r - k]) / 2``; the first ``k`` rows stay NaN.

    Args:
        df: DataFrame with a "Close" column. It is not modified.
            NOTE(review): a duplicated index would multiply rows in the
            final join - presumably the index is unique; confirm.
        start_index: First offset (inclusive). Offset 0 is not supported
            because ``values[:-0]`` is an empty slice.
        end_index: Last offset (exclusive).

    Returns:
        A new DataFrame holding ``df`` plus one "EMA<k>" column per offset.
    """
    nrows = len(df)
    ncols = end_index - start_index

    # Container for the pairwise differences; cells without an earlier row stay NaN.
    pair_wise_diff = np.full((nrows, ncols), np.nan)

    # "Close" column as a 1-D NumPy array.
    values = df['Close'].values

    # One vectorized subtraction per offset.
    for offset in range(start_index, end_index):
        diff = np.abs(values[offset:] - values[:-offset]) / 2.0
        pair_wise_diff[offset:, offset - start_index] = diff

    columns = ["EMA%d" % i for i in range(start_index, end_index)]
    # Reuse df's own index so the join lines rows up whatever labels df carries.
    df_result = pd.DataFrame(data=pair_wise_diff, index=df.index, columns=columns)

    # Add the result to df, merging on the index.
    return df.join(df_result)
# Benchmark driver (Jupyter/IPython only: the %timeit lines are line magics).
# Fetch IBM stock prices from Yahoo through pandas_datareader
# (777 DataFrame rows according to the original note).
df = dr.data.get_data_yahoo('ibm', start = '2017-09-01', end = '2020-10-02')
df.reset_index(level=0, inplace = True) # create index which is 0, 1, 2, ...
# Time the original posted (row-by-row) implementation
df1 = df.copy() # Copy the data because calculateEMAs writes into its argument
%timeit calculateEMAs(df1, 1, 51) # Jupyter Notebook Magic method
# Time the NumPy version
%timeit compute_using_np(df, 1, 51) # Jupyter Notebook Magic method
# No copy is needed here because compute_using_np does not work in place