使用多进程(multiprocessing)读取多个文件



我有一个简单的函数,可以扫描文件中的特殊字符串,但由于这些文件位于缓慢的远程文件存储中,我需要并行扫描它们。

我想我需要使用多进程(multiprocessing),但不确定该如何正确地实现。

这是我的函数:

from fnmatch import fnmatch
import os
from shutil import copy
from pprint import pprint
def getFailedFile(directory_name, folder_to_write):
    """Copy the request file of every failed response found in a directory.

    Each ``*Response.txt`` file in ``directory_name`` is read as UTF-8. When
    its text contains ``'Exception'``, the sibling file whose name has
    ``'Response'`` replaced by ``'Request'`` is copied into
    ``folder_to_write``, which is resolved below the current working
    directory.

    Args:
        directory_name: Directory to scan; a trailing separator is optional.
        folder_to_write: Destination folder relative to the current working
            directory; a leading separator is tolerated.

    Returns:
        None. Unreadable (non-UTF-8) responses and missing request files are
        reported with ``print`` and skipped.
    """
    # Strip leading separators: os.path.join would otherwise treat the
    # folder as absolute and discard the working-directory prefix.
    destination = os.path.join(os.getcwd(), folder_to_write.lstrip('/\\'))

    for file in os.listdir(directory_name):
        if not fnmatch(file, '*Response.txt'):
            continue

        # os.path.join instead of "+" so directory_name works with or
        # without a trailing separator.
        filename = os.path.join(directory_name, file)
        try:
            with open(filename, 'r', encoding='utf-8') as myfile:
                data = myfile.read()
        except UnicodeDecodeError:
            print('error unicode decode -', filename)
            continue

        if 'Exception' not in data:
            continue

        requestFile = os.path.join(
            directory_name, file.replace('Response', 'Request')
        )
        try:
            copy(requestFile, destination)
        except FileNotFoundError:
            print('no such file - ', requestFile)
# Example invocation; replace the placeholder paths with real ones.
directory_name = 'some folder'
folder_to_write = 'some folder_to_write'
# Both arguments are passed by keyword: a positional argument placed after a
# keyword argument is a SyntaxError.
getFailedFile(directory_name=directory_name, folder_to_write=folder_to_write)

请帮忙。由于目标文件夹中的文件数量很多,目前整个扫描大约需要 4 个小时。

终于找到了如何做到这一点:

from fnmatch import fnmatch
import os
from shutil import copy
from multiprocessing import Pool
import time
import logging
def process_file(file, directory_name='directory with files',
                 target_directory='directory to write'):
    """Copy a failed response file and its request file to a target folder.

    Designed to be mapped over ``os.listdir(directory_name)`` by a worker
    pool, so it takes a bare file name and ignores names that do not match
    ``*Response.txt``.

    Args:
        file: File name (not a path) inside ``directory_name``.
        directory_name: Directory that holds the response/request files.
        target_directory: Destination passed to ``shutil.copy``.

    Returns:
        None. A response that is not valid UTF-8, or a copy that fails, is
        printed and logged instead of raised, so one bad file does not abort
        the whole ``Pool.map`` call.
    """
    if not fnmatch(file, '*Response.txt'):
        return

    # os.path.join instead of "+" so directory_name works with or without a
    # trailing separator.
    filename = os.path.join(directory_name, file)
    try:
        with open(filename, 'r', encoding='utf-8') as myfile:
            data = myfile.read()
    except UnicodeDecodeError:
        print('error unicode decode -', filename)
        logging.info('error unicode decode -%s', filename)
        return

    # 'xception' (no leading letter) matches both "Exception" and "exception".
    if 'xception' not in data:
        return

    requestFile = os.path.join(
        directory_name, file.replace('Response', 'Request')
    )
    try:
        # Request first: if it is missing, the response is not copied either.
        copy(requestFile, target_directory)
        copy(filename, target_directory)
    except FileNotFoundError as e:
        # Handled before the generic clause so this branch is reachable.
        missing = e.filename or requestFile
        print('no such file - ', missing)
        logging.info('no such file - %s', missing)
    except Exception as e:
        # Deliberately broad: copying is best-effort per file.
        logging.info('%s', e)
        print(str(e))
# Entry point: scan the directory in parallel and log the elapsed time.
if __name__ == '__main__':
    # NOTE(review): no logging configuration is visible in this snippet; with
    # the default root level (WARNING) the INFO records below are not
    # emitted -- confirm logging is configured elsewhere.
    try:
        directory_name = 'directory with files'
        # Far more workers than cores: the work is dominated by waiting on
        # slow remote storage rather than by CPU.
        number_of_processes = 50
        logging.info('Number of processes - %s', number_of_processes)
        logging.info('Directory to scan %s', directory_name)
        # The context manager releases the workers even if map() raises.
        with Pool(number_of_processes) as pool:
            start_time = time.time()
            pool.map(process_file, os.listdir(directory_name))
            elapsed_time = time.time() - start_time
        # Elapsed time is reported in minutes.
        logging.info('Elapsed time - %s', elapsed_time / 60)
    except Exception:
        # Logged at ERROR level with traceback so a failed run is visible
        # even under the default logging configuration.
        logging.exception('Scan failed')

我知道代码不是很漂亮,但它现在只需运行 27 分钟,而之前大约需要 4 个小时。

最新更新