同时计算具有不同索引的for循环

我有以下函数：

def calculateEMAs(df,startIndex,endIndex):
    """Fill the columns "EMA1".."EMA50" of ``df`` in place, then print ``df``.

    For every row ``index`` and every lag ``i`` in 1..50 the value
    ``abs(close[index - i] - close[index]) / 2`` is written to
    ``df.loc[index, "EMA<i>"]``, where ``close`` is the "Trade Close" column.
    The expression is a placeholder for the real EMA formula.

    ``startIndex`` and ``endIndex`` are accepted but not used by this
    version: the lag range is hard-coded to 1..50.

    ``index`` is used both as a label (``df.loc``) and as a position
    (``df.iloc``), so the frame is expected to have the index 0, 1, 2, ...

    Returns None; the results live in the mutated ``df``.
    """
    for index,row in df.iterrows():
        for i in range (1,51):
            # Only look back at rows that exist. Note that `> 0` also skips
            # the pair whose earlier row is row 0 (index - i == 0).
            if(index-i > 0):
                df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula
    print(df)

这个循环计算数据帧的值需要很长时间，因为每一行都必须循环50次（大约需要62秒）。

我尝试按照这个问题中的做法使用多进程池（multiprocessing 的 Pool）。我的代码现在是这样的：

def calculateEMAs(df,startIndex,endIndex):
    """Fill the columns "EMA<startIndex>".."EMA<endIndex - 1>" of ``df`` in place, then print ``df``.

    For every row ``index`` and every lag ``i`` in
    ``range(startIndex, endIndex)`` the value
    ``abs(close[index - i] - close[index]) / 2`` is written to
    ``df.loc[index, "EMA<i>"]``, where ``close`` is the "Trade Close" column.
    The expression is a placeholder for the real EMA formula.

    ``index`` is used both as a label (``df.loc``) and as a position
    (``df.iloc``), so the frame is expected to have the index 0, 1, 2, ...

    Returns None; the results live only in the ``df`` object that was
    passed in.
    """
    for index,row in df.iterrows():
        for i in range (startIndex,endIndex):
            # Only look back at rows that exist. Note that `> 0` also skips
            # the pair whose earlier row is row 0 (index - i == 0).
            if(index-i > 0):
                df.loc[index,"EMA%d"%i] = abs(df.iloc[index-i]["Trade Close"] - df.iloc[index]["Trade Close"])/2 #replace this with EMA formula
    print(df)

def main():
    """Run calculateEMAs for five lag ranges in a process pool and time it.

    The lag range 1..50 is split into five slices, each submitted to the
    pool with ``apply_async``. The elapsed wall-clock time and the parent's
    ``dfClosePrice`` are printed at the end.
    """
    dfClosePrice= getFileDataframe().to_frame()
    # The context manager shuts the pool down on exit, so the worker
    # processes are not left to the garbage collector.
    with Pool() as pool:
        time0 = time.time()
        # Arguments are pickled and sent to the workers, so each task mutates
        # its own copy of dfClosePrice rather than the frame held here.
        result1 = pool.apply_async(calculateEMAs,[dfClosePrice,1,10])
        result2 = pool.apply_async(calculateEMAs,[dfClosePrice,10,20])
        result3 = pool.apply_async(calculateEMAs,[dfClosePrice,20,30])
        result4 = pool.apply_async(calculateEMAs,[dfClosePrice,30,40])
        result5 = pool.apply_async(calculateEMAs,[dfClosePrice,40,51])
        # .get() blocks until the task has finished and yields whatever
        # calculateEMAs returned.
        answer1 = result1.get()
        answer2 = result2.get()
        answer3 = result3.get()
        answer4 = result4.get()
        answer5 = result5.get()
        print(time.time() - time0)
    # Printed from the parent process: this is the frame as it was loaded.
    print(dfClosePrice)

我使用不同的for循环范围异步运行该函数。这需要19秒才能完成，并且我可以看到每个函数都正确打印了结果，但dfClosePrice的最终值是一个只有1列（Trade Close）的数据帧，各个异步函数计算出的新列并没有添加到该数据帧中。我该如何正确地实现这一点？

使用Numpy矢量化的解决方案

问题

`if(index-i > 0):` 这一行应该改为 `if(index-i >= 0):`，否则会漏掉与第0行（即 index-i 等于 0 时）的差值。
使用"Close"而不是"Trade Close"（对性能没有影响，只是避免了从网上获取数据后还要重命名列）

代码

import numpy as np
import pandas as pd
def compute_using_np(df, start_index, end_index):
    '''
    Using numpy to vectorize computation

    For every offset in range(start_index, end_index) the column
    "EMA<offset>" holds abs(close[row] - close[row - offset]) / 2, where
    close is the 'Close' column; rows that have no partner `offset` rows
    earlier are NaN.

    Offsets are expected to be >= 1: an offset of 0 makes `values[:-offset]`
    an empty slice.

    Returns a new DataFrame (df joined with the EMA columns); df itself is
    not modified.
    '''
    nrows = len(df)
    ncols = end_index - start_index
    # container for pairwise differences, NaN where no difference exists
    pair_wise_diff = np.full((nrows, ncols), np.nan)
    # Get values of the Close column as numpy 1D array
    values = df['Close'].values
    # Compute differences for different offsets
    for offset in range(start_index, end_index):
        # Using numpy to compute vectorized difference (i.e. faster computation)
        diff = np.abs(values[offset:] - values[:-offset]) / 2.0

        # Update result: the first `offset` rows keep their NaN
        pair_wise_diff[offset:, offset - start_index] = diff

    # Place into DataFrame
    columns = ["EMA%d" % i for i in range(start_index, end_index)]

    # Reuse df's own index so the join below lines up row for row, whatever
    # labels df carries.
    df_result = pd.DataFrame(data=pair_wise_diff, index=df.index, columns=columns)

    # Add result to df merging on index
    return df.join(df_result)

用法

df_result = compute_using_np(df, 1, 51)

性能

摘要

原帖代码：每次循环 37.9 s ± 143 ms（7 次运行的平均值 ± 标准差，每次运行 1 个循环）
NumPy代码：每次循环 1.56 ms ± 27.2 µs（7 次运行的平均值 ± 标准差，每次运行 1000 个循环）
结果：加速约2万倍（37.9 s ÷ 1.56 ms ≈ 24,000 倍）

测试代码

import pandas_datareader as dr
import pandas as pd
import numpy as np
def calculateEMAs(df, start_index, end_index):
    '''
    Posted code changed 1) use Python PEP 8 naming convention,
    2) corrected conditional

    For every row `index` and every lag `i` in range(start_index, end_index)
    with index - i >= 0, writes abs(close[index - i] - close[index]) / 2 to
    df.loc[index, "EMA<i>"], where close is the 'Close' column.

    `index` is used both as a label (df.loc) and as a position (df.iloc),
    so df is expected to have the index 0, 1, 2, ...

    Modifies df in place and returns that same object.
    '''
    for index, row in df.iterrows():
        for i in range(start_index, end_index):
            # Only look back at rows that exist (row 0 included)
            if(index-i >= 0):
                df.loc[index, "EMA%d" % i] = abs(df.iloc[index-i]["Close"] - df.iloc[index]["Close"])/2 #replace this with EMA formula
    return df
def compute_using_np(df, start_index, end_index):
    '''
    Using numpy to vectorize computation

    For every offset in range(start_index, end_index) the column
    "EMA<offset>" holds abs(close[row] - close[row - offset]) / 2, where
    close is the 'Close' column; rows that have no partner `offset` rows
    earlier are NaN.

    Offsets are expected to be >= 1: an offset of 0 makes `values[:-offset]`
    an empty slice.

    Returns a new DataFrame (df joined with the EMA columns); df itself is
    not modified.
    '''
    nrows = len(df)

    ncols = end_index - start_index
    # container for pairwise differences, NaN where no difference exists
    pair_wise_diff = np.full((nrows, ncols), np.nan)
    # Get values of the Close column as numpy 1D array
    values = df['Close'].values
    # Compute differences for different offsets
    for offset in range(start_index, end_index):
        # Using numpy to compute vectorized difference (i.e. faster computation)
        diff = np.abs(values[offset:] - values[:-offset]) / 2.0

        # Update result: the first `offset` rows keep their NaN
        pair_wise_diff[offset:, offset - start_index] = diff

    # Place into DataFrame
    columns = ["EMA%d" % i for i in range(start_index, end_index)]

    # Reuse df's own index so the join below lines up row for row, whatever
    # labels df carries.
    df_result = pd.DataFrame(data=pair_wise_diff, index=df.index, columns=columns)

    # Add result to df merging on index
    return df.join(df_result)
# Get IBM stock prices from Yahoo via pandas_datareader (777 DataFrame rows in the original run)
df = dr.data.get_data_yahoo('ibm', start = '2017-09-01', end = '2020-10-02')
df.reset_index(level=0, inplace = True)   # move the existing index into a column so the index is 0, 1, 2, ...
# Time the originally posted code
df1 = df.copy()                    # Copy the data because calculateEMAs adds its columns in place
%timeit calculateEMAs(df1, 1, 51)  # %timeit is an IPython/Jupyter magic, not plain Python
# Time the NumPy version
%timeit compute_using_np(df, 1, 51)  # %timeit is an IPython/Jupyter magic, not plain Python
# No copy is needed for the NumPy version: compute_using_np returns a new DataFrame and leaves df untouched

相关内容

最新更新

热门标签：