如何提取和处理zip文件中的所有文件



我想提取并处理压缩文件(zip)中的所有文件,应该怎么做?

import re
import zipfile
import pathlib
import pandas as pd

# Extract the members of a zip archive and strip "mHealth_" from the names of
# the extracted files.
# NOTE(review): nothing is downloaded here; the function only reads the
# archive passed in as zip_file.
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the statements below cannot be confirmed from the text alone.
def parse(zip_file):
# extractall() is called without a path argument, so the members are
# extracted into the current working directory.
with zipfile.ZipFile(zip_file, "r") as zfile:
# BUG: extractall() returns None, so iterating over its result raises
# "TypeError: 'NoneType' object is not iterable" (see the traceback below).
for file in zfile.extractall():
# The attributes used below (is_file, stem, suffix, parent, rename) are
# pathlib.Path members, so the loop expects Path objects.
if file.is_file():
old_name = file.stem
extension = file.suffix
directory = file.parent
# re.sub removes every occurrence of "mHealth_" in the stem, not only a
# leading one.
new_name = re.sub("mHealth_", "", old_name) + extension
file = file.rename(pathlib.Path(directory, new_name))
# Redundant: the with statement already closes the archive on exit.
zfile.close()
# NOTE(review): returns the loop variable, i.e. presumably only the last
# processed file - confirm the intended return value.
return file

错误回溯信息(Traceback):

Traceback (most recent call last):
  File "C:\Users\User\PycharmProjects\algorithms\project_kmeans.py", line 47, in <module>
    df_ = parse(zip_file_)
  File "C:\Users\User\PycharmProjects\algorithms\project_kmeans.py", line 12, in parse
    for file in zfile.extractall():
TypeError: 'NoneType' object is not iterable

Process finished with exit code 1

您需要使用 infolist() 或 namelist(),而不是 extractall(),才能配合 for 循环遍历文件。

extractall() 从 zip 中提取文件,但它不返回文件名(返回值为 None),因此不能与 for 循环一起使用。

infolist() 或 namelist() 可以提供文件名,但这会带来另一个问题:它们返回的是 ZipInfo 对象或字符串(string),而不是 Path,因此没有 .is_file()、.stem 等方法和属性。您必须先将其转换为 Path:

import zipfile
import pathlib
import pandas as pd
def parse(zip_file, *, output_dir=None, prefix="mHealth_"):
    """Extract a zip archive and strip *prefix* from the extracted file names.

    Every member of the archive is extracted, then each extracted regular
    file is renamed in place so that its stem no longer contains *prefix*.

    Args:
        zip_file: Path (or file-like object) of the zip archive to read.
        output_dir: Directory to extract into. ``None`` (the default)
            extracts into the current working directory.
        prefix: Text removed from every file stem. All occurrences are
            removed, not only a leading one.

    Returns:
        A ``pandas.DataFrame`` with one row per extracted file and the
        columns ``old`` (member name as stored in the archive) and ``new``
        (file name after renaming, without its directory).

    Raises:
        zipfile.BadZipFile: If *zip_file* is not a valid zip archive.
        OSError: If extracting or renaming a file fails.
    """
    base_dir = pathlib.Path() if output_dir is None else pathlib.Path(output_dir)
    results = []

    with zipfile.ZipFile(zip_file, "r") as zfile:
        # extractall() returns None, so the member names have to be read
        # separately from infolist().
        zfile.extractall(path=output_dir)

        # zfile.namelist() would work as well; it yields the member names
        # directly as strings instead of ZipInfo objects.
        for fileinfo in zfile.infolist():
            filename = fileinfo.filename
            path = base_dir / filename
            if not path.is_file():
                # Directory entries have nothing to rename.
                continue

            new_name = path.stem.replace(prefix, "") + path.suffix
            target = path.parent / new_name
            # replace() overwrites an existing target on every platform,
            # whereas rename() raises FileExistsError on Windows (e.g. when
            # the archive is parsed a second time). Its return value is not
            # used because it is None before Python 3.8.
            path.replace(target)
            print('path:', target)
            results.append([filename, new_name])

    return pd.DataFrame(results, columns=['old', 'new'])
# Example usage: process the archive 'test.zip' and print the table of
# old/new file names returned by parse().
df = parse('test.zip')
print(df)

文档:infolist() 和 extractall()

最新更新