读取gzip文件时出现非gzip文件错误



我把一个DataFrame保存成了使用gzip压缩的Parquet文件:

df.to_parquet("meth_450_clin_all_kipan.parquet.gz", compression="gzip")

然后我想把它加载为矩阵:

matrix = pd.read_table('../input/meth-clin-kipan/meth_450_clin_all_kipan.parquet.gz')

错误回溯(Traceback):

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
/tmp/ipykernel_18/3894087199.py in <module>
----> 4 matrix = pd.read_table('../input/meth-clin-kipan/meth_450_clin_all_kipan.parquet.gz')
/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309                     stacklevel=stacklevel,
310                 )
--> 311             return func(*args, **kwargs)
312 
313         return wrapper
/opt/conda/lib/python3.7/site-packages/pandas/io/parsers/readers.py in read_table(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, encoding_errors, delim_whitespace, low_memory, memory_map, float_precision)
681     kwds.update(kwds_defaults)
682 
--> 683     return _read(filepath_or_buffer, kwds)
684 
685 
/opt/conda/lib/python3.7/site-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
480 
481     # Create the parser.
--> 482     parser = TextFileReader(filepath_or_buffer, **kwds)
483 
484     if chunksize or iterator:
/opt/conda/lib/python3.7/site-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds)
809             self.options["has_index_names"] = kwds["has_index_names"]
810 
--> 811         self._engine = self._make_engine(self.engine)
812 
813     def close(self):
/opt/conda/lib/python3.7/site-packages/pandas/io/parsers/readers.py in _make_engine(self, engine)
1038             )
1039         # error: Too many arguments for "ParserBase"
-> 1040         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
1041 
1042     def _failover_to_python(self):
/opt/conda/lib/python3.7/site-packages/pandas/io/parsers/c_parser_wrapper.py in __init__(self, src, **kwds)
67         kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
68         try:
---> 69             self._reader = parsers.TextReader(self.handles.handle, **kwds)
70         except Exception:
71             self.handles.close()
/opt/conda/lib/python3.7/site-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
/opt/conda/lib/python3.7/site-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._get_header()
/opt/conda/lib/python3.7/site-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows()
/opt/conda/lib/python3.7/site-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.raise_parser_error()
/opt/conda/lib/python3.7/_compression.py in readinto(self, b)
66     def readinto(self, b):
67         with memoryview(b) as view, view.cast("B") as byte_view:
---> 68             data = self.read(len(byte_view))
69             byte_view[:len(data)] = data
70         return len(data)
/opt/conda/lib/python3.7/gzip.py in read(self, size)
472                 # jump to the next member, if there is one.
473                 self._init_read()
--> 474                 if not self._read_gzip_header():
475                     self._size = self._pos
476                     return b""
/opt/conda/lib/python3.7/gzip.py in _read_gzip_header(self)
420 
421         if magic != b'\037\213':
--> 422             raise OSError('Not a gzipped file (%r)' % magic)
423 
424         (method, flag,
OSError: Not a gzipped file (b'PA')

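出错的原因是:to_parquet的compression="gzip"只是在Parquet文件内部对数据进行gzip压缩,生成的文件本身仍然是Parquet格式(文件以魔数"PAR1"开头,所以报错信息里显示b'PA'),并不是一个gzip文件。而read_table用于读取分隔符文本文件,它会根据".gz"后缀推断压缩方式并尝试按gzip解压,因此失败。

解决方案是使用read_parquet读取文件(它会自动处理内部的压缩),然后将其转换为numpy数组。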

matrix = pd.read_parquet('../input/meth-clin-kipan/meth_450_clin_all_kipan.parquet.gz').to_numpy()

最新更新