Python,循环读取Excel文件表,更改标题行号



我有一个循环,用于计算xml的每个工作表中的行数。当我打开xls本身时,计数与python返回给我的不一致。

这是由于第一个工作表标题在第3行。我怎么能改变我的代码读取第一张表只在第3行,忽略前两行?其余的表总是从最上面一行开始,不包含标题。我想计算不包含页眉的第一个工作页的长度。

然而,当我打开excel并计算我的工作表时,我得到

65522 , header starts in row 3, expecting a count of 65520
65520
65520
65520
65520
65520
65520
65520
65520
65520
65520
25427

我的完整代码:

from io import BytesIO
from pathlib import Path
from zipfile import ZipFile
import os
import pandas as pd
from os import walk

def process_files(files: list) -> pd.DataFrame:
file_mapping = {}
for file in files:
#data_mapping = pd.read_excel(BytesIO(ZipFile(file).read(Path(file).stem)), sheet_name=None)

archive = ZipFile(file)
# find file names in the archive which end in `.xls`, `.xlsx`, `.xlsb`, ...
files_in_archive = archive.namelist()
excel_files_in_archive = [
f for f in files_in_archive if Path(f).suffix[:4] == ".xls"
]
# ensure we only have one file (otherwise, loop or choose one somehow)
assert len(excel_files_in_archive) == 1
# read in data
data_mapping = pd.read_excel(
BytesIO(archive.read(excel_files_in_archive[0])),
sheet_name=None, header=None,
)


row_counts = []
for sheet in list(data_mapping.keys()):
if sheet == 'Sheet1':
df = data_mapping.get(sheet)[3:]

else:
df = data_mapping.get(sheet)
row_counts.append(len(df))
print(len(data_mapping.get(sheet)))




file_mapping.update({file: sum(row_counts)})
frame = pd.DataFrame([file_mapping]).transpose().reset_index()
frame.columns = ["file_name", "row_counts"]
return frame

dir_path = r'D:test2022 - 10'


zip_files = []
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith('.zip'):
zip_files.append(os.path.join(root, file))
df = process_files(zip_files)   #function
有人知道我做错了什么吗?

你只需要使用skiprows参数:https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html

# read in data
data_mapping = pd.read_excel(
BytesIO(archive.read(excel_files_in_archive[0])),
sheet_name=None, header=None, skiprows=2
)

或者不使用skiprows,然后直接切片工作表的数据框:

row_counts = []
for sheet in list(data_mapping.keys()):
if sheet == 'name of first sheet':
df = data_mapping.get(sheet)[3:]
else:
df = data_mapping.get(sheet)
row_counts.append(len(df))
print(len(data_mapping.get(sheet)))
##or based on the location in the list. you don't need to call list() on .keys()
for sheet, i in enumerate(data_mapping.keys()):
if i == 0:
df = data_mapping.get(sheet)[3:]
else:
df = data_mapping.get(sheet)
row_counts.append(len(df))
print(len(data_mapping.get(sheet)))

最新更新