使用 csv+pandas+python 进行多处理



我编写了一个代码,该代码将迭代文件夹中的每个csv,使用数据帧读取它并将其附加到主df,该主df将成为以后的用户。

import glob
import os
import pandas as pd
import time
import multiprocessing as mp
from multiprocessing.dummy import Pool

constituent_df= pd.DataFrame()
def process(file):
'''
This Function reads csv and appends it to a global data-frame
Parameters:
file-> csv file
'''
fields= ('REGION', 'CURR')
print("Pandas Reading:", file)
csv_df= pd.read_csv(file, skipinitialspace=True, usecols=fields)
constituent_df= constituent_df.append(csv_df, ignore_index=True)
def main():
'''
This module reads files present in the directory
And 
'''
pool = mp.Pool(processes=4)
start= time.time()
constituent_df= pd.DataFrame()
for file in glob.glob(os.path.join(os.getcwd(),'2653AM\*.csv')):
pool.apply_async(process,[file])
pool.close()
pool.join()   
end= time.time()
print("It took:", end-start)
print(constituent_df)
constituent_df.to_excel(excel_writer="Constituent_Data_MP.xlsx", index=False)
if __name__=='__main__':
main()
#print(constituent_df)

我无法拯救constituent_df。谁能指导我如何存储constituent_df?还有其他办法吗?

我修改了我想在池中处理文件的方式,并得到了一个有效的解决方案:

def main():
'''
This module reads files present in the directory
And 
'''
file_list=[]
constituent_df= pd.DataFrame()
start= time.time()
for file in glob.glob(os.path.join(os.getcwd(),'2653AM\*.csv')):
file_list.append(file)
with Pool(processes=4) as pool:
df_list = pool.map(process_csv, file_list)
constituent_df = pd.concat(df_list, ignore_index=True)
end= time.time()
print("It took:", end-start)

最新更新