Python:如何处理文件夹中多个不同类型的文件?



我在同一个文件夹中有一个zip文件A001-C-002.zip和一个.xlsx文件HUBMAP B004 codex antibodies metadata.xlsx。首先,我想读取xlsx文件并将其转换为数据帧。接下来,我要处理zip文件中的所有文件。

from pathlib import Path
import pandas as pd
import zipfile
import os
import sys
# Load the antibody metadata workbook into a DataFrame, then stream every
# file stored inside the patient's zip archive.

# Resolve to an absolute path BEFORE changing directory: a relative
# "./../../" would otherwise point two more levels up once the working
# directory has moved.
path = Path("./../../").resolve()
os.chdir(path)

# Give pandas the path and let it open the workbook itself. An .xlsx file is
# a binary (zip) container, so handing read_excel a handle opened in text
# mode ('r') fails with UnicodeDecodeError as soon as pandas peeks at it.
ab_df = pd.read_excel(path / "HUBMAP B004 codex antibodies metadata.xlsx")
print(f"Antibody metadata column names:\n {ab_df.columns.values}")

# Patient A001
with zipfile.ZipFile(path / "A001-C-002.zip") as z:
    for filename in z.namelist():
        # Member names live inside the archive, not on disk, so ask the
        # archive (not os.path) whether an entry is a directory.
        if z.getinfo(filename).is_dir():
            continue
        # Each member is opened as a binary stream; lines are bytes objects.
        with z.open(filename) as member:
            for line in member:
                print(line)
# No explicit z.close(): the with-statement closes the archive on exit.

回溯

> --------------------------------------------------------------------------- UnicodeDecodeError                        Traceback (most recent call
> last) /tmp/ipykernel_3212/4008185006.py in <module>
>       2     with open(os.path.join(os.getcwd(), filename), 'r') as f:
>       3         with open("HUBMAP B004 codex antibodies metadata.xlsx", 'r') as ab:
> ----> 4             ab_df = pd.read_excel(ab)
>       5             print(f"Antibody metadata column names:n {ab_df.columns.values}")
>       6 
> 
> ~/.local/lib/python3.8/site-packages/pandas/util/_decorators.py in
> wrapper(*args, **kwargs)
>     309                     stacklevel=stacklevel,
>     310                 )
> --> 311             return func(*args, **kwargs)
>     312 
>     313         return wrapper
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/excel/_base.py in
> read_excel(io, sheet_name, header, names, index_col, usecols, squeeze,
> dtype, engine, converters, true_values, false_values, skiprows, nrows,
> na_values, keep_default_na, na_filter, verbose, parse_dates,
> date_parser, thousands, comment, skipfooter, convert_float,
> mangle_dupe_cols, storage_options)
>     362     if not isinstance(io, ExcelFile):
>     363         should_close = True
> --> 364         io = ExcelFile(io, storage_options=storage_options, engine=engine)
>     365     elif engine and engine != io.engine:
>     366         raise ValueError(
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/excel/_base.py in
> __init__(self, path_or_buffer, engine, storage_options)    1189                 ext = "xls"    1190             else:
> -> 1191                 ext = inspect_excel_format(    1192                     content_or_path=path_or_buffer, storage_options=storage_options   
> 1193                 )
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/excel/_base.py in
> inspect_excel_format(content_or_path, storage_options)    1073        
> stream = handle.handle    1074         stream.seek(0)
> -> 1075         buf = stream.read(PEEK_SIZE)    1076         if buf is None:    1077             raise ValueError("stream is empty")
> 
> /usr/lib/python3.8/codecs.py in decode(self, input, final)
>     320         # decode input (taking the buffer into account)
>     321         data = self.buffer + input
> --> 322         (result, consumed) = self._buffer_decode(data, self.errors, final)
>     323         # keep undecoded input until the next call
>     324         self.buffer = data[consumed:]
> 
> UnicodeDecodeError: 'utf-8' codec can't decode byte 0x9a in position
> 15: invalid start byte

对于读取Excel文件,如果最终要将其转换为数据帧,最好直接使用pandas。我找到了解决你这个问题的办法,下面这篇文章讲的就是如何读取xlsx文件。

读取Excel文件的问题

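那篇文章的要点是:不要先用open()以文本模式('r')打开.xlsx文件再传给pandas——.xlsx是二进制格式,按UTF-8解码就会触发上面的UnicodeDecodeError——而是直接把文件路径传给pd.read_excel: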

# Pass the file path straight to pandas rather than a text-mode file handle.
df = pd.read_excel("HUBMAP B004 codex antibodies metadata.xlsx")

最新更新