Reading thousands of JSON files and processing them with Python multiprocessing



I am trying to read thousands of JSON files from a directory, process each file separately, and store the results in a dictionary. I already have working code for sequential execution. Now I want to take advantage of multiprocessing to speed up the whole process.

So far I have:


import json
import os
from multiprocessing import Process, Manager

def read_file(file_name):
    '''
    Read the given json file and return data
    '''
    with open(file_name) as file:
        data = json.load(file)
    return data

def do_some_process(data):
    '''
    Some calculation will be done here
    and return the result
    '''
    return some_result

def process_each_file(file, result):
    file_name = file.split('.')[0]
    # reading data from file
    data = read_file('../data/{}'.format(file))
    processed_result = do_some_process(data)
    result[file_name] = processed_result

if __name__ == '__main__':
    manager = Manager()
    result = manager.dict()
    file_list = os.listdir("../data")

    all_process = [Process(target=process_each_file, args=(file, result))
                   for file in file_list if file.endswith(".json")]

    for p in all_process:
        p.start()

    for p in all_process:
        p.join()

    '''
    Do some further work with 'result' variable
    '''

When I run this code, it raises OSError: [Errno 24] Too many open files.

How can I achieve my goal?

Your code fails because it starts one Process per file, all at once. With thousands of files, the parent process runs out of file descriptors (each child consumes several for its pipes), which is exactly what OSError: [Errno 24] means. The fix is to cap the number of concurrent workers. Python's multiprocessing.Pool does this: it keeps a fixed pool of worker processes (defaulting to os.cpu_count()) and distributes the files among them:

import os
import json
from multiprocessing import Pool

def process_data(data):
    # Some calculation will be done here
    return data

def process_json_file(filename):
    with open(filename, 'r') as f:
        data = json.load(f)
    # Process the data here...
    processed_data = process_data(data)
    return processed_data

if __name__ == '__main__':
    # List all the JSON files in the current directory
    json_files = [f for f in os.listdir('.') if f.endswith('.json')]

    # Create a pool of workers to process the files concurrently
    with Pool() as pool:
        # Apply the processing function to each JSON file concurrently
        results = pool.map(process_json_file, json_files)

    # Do something with the results
    for result in results:
        print(result)
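
If you still want the results keyed by file name, as in your original code, you don't need a Manager at all: have the worker return a (name, result) pair and build an ordinary dict in the parent. Here is a minimal sketch under those assumptions; do_some_process is a placeholder for your real calculation, and DATA_DIR stands in for your ../data directory:

import os
import json
from multiprocessing import Pool

DATA_DIR = '../data'  # assumed location of the JSON files

def do_some_process(data):
    # Placeholder for the real calculation
    return len(data)

def process_each_file(file):
    # Read one JSON file from the data directory
    with open(os.path.join(DATA_DIR, file)) as f:
        data = json.load(f)
    # Key the result by the file name without its extension
    return file.split('.')[0], do_some_process(data)

if __name__ == '__main__':
    json_files = [f for f in os.listdir(DATA_DIR) if f.endswith('.json')]
    with Pool() as pool:
        # imap_unordered streams results back as workers finish;
        # a chunksize amortizes IPC overhead across thousands of files
        result = dict(pool.imap_unordered(process_each_file, json_files, chunksize=64))
    # 'result' is now a plain dict: {file_name: processed_result}

Because the pool never runs more than os.cpu_count() workers, only that many files are open at any moment, so the Errno 24 failure cannot occur. The chunksize value is just a tuning knob and can be omitted.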
