高效重命名具有复杂逻辑的数据帧列



我有两个问题,第一个问题是我想在xlsxconcatsheets,用于以下代码:

import os
import pandas as pd
shared_BM_NL_Q2_DNS = r'Shared_BM_NL_Q2_DNS.xlsx'
sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']
cols = ['A:AB', 'A:AB', 'A:AB']
df = {}
for ws, c in zip(sheet_names, cols):
df[ws] = pd.read_excel(shared_BM_NL_Q2_DNS, sheet_name = ws, usecols = c)

第二期我想阅读表格中的所有列,而不是使用下面的行:

cols = ['A:AB', 'A:AB', 'A:AB']

请注意:表单中具有相同名称的列

我也想用更好、更短的方式执行如下代码:

# shared_BM_NL_Q2_DNS
shared_BM_NL_Q2_DNS_df1.columns = shared_BM_NL_Q2_DNS_df1.columns.str.replace(' ', '_')
shared_BM_NL_Q2_DNS_df1.columns = shared_BM_NL_Q2_DNS_df1.columns.str.replace('n', '')
shared_BM_NL_Q2_DNS_df1.columns = shared_BM_NL_Q2_DNS_df1.columns.str.replace(r"[^a-zA-Zd_]+", "")
shared_BM_NL_Q2_DNS_df1.columns = map(str.lower, shared_BM_NL_Q2_DNS_df1.columns)
shared_BM_NL_Q2_DNS_df2.columns = shared_BM_NL_Q2_DNS_df2.columns.str.replace(' ', '_')
shared_BM_NL_Q2_DNS_df2.columns = shared_BM_NL_Q2_DNS_df2.columns.str.replace('n', '')
shared_BM_NL_Q2_DNS_df2.columns = shared_BM_NL_Q2_DNS_df2.columns.str.replace(r"[^a-zA-Zd_]+", "")
shared_BM_NL_Q2_DNS_df2.columns = map(str.lower, shared_BM_NL_Q2_DNS_df2.columns)
shared_BM_NL_Q2_DNS_df3.columns = shared_BM_NL_Q2_DNS_df3.columns.str.replace(' ', '_')
shared_BM_NL_Q2_DNS_df3.columns = shared_BM_NL_Q2_DNS_df3.columns.str.replace('n', '')
shared_BM_NL_Q2_DNS_df3.columns = shared_BM_NL_Q2_DNS_df3.columns.str.replace(r"[^a-zA-Zd_]+", "")
shared_BM_NL_Q2_DNS_df3.columns = map(str.lower, shared_BM_NL_Q2_DNS_df3.columns)
dataframes2 = [shared_BM_NL_Q2_DNS_df1, shared_BM_NL_Q2_DNS_df2, shared_BM_NL_Q2_DNS_df3]
join2 = pd.concat(dataframes2).reset_index(drop=True)

并且之前的代码在更新为以下代码之前属于我的旧代码:

import os
import pandas as pd
shared_BM_NL_Q2_DNS = 'Shared_BM_NL_Q2_DNS.xlsx'
shared_BM_NL_Q2_DNS_df1 = pd.read_excel(os.path.join(os.path.dirname(__file__), shared_BM_NL_Q2_DNS), sheet_name='client31_KPN')
shared_BM_NL_Q2_DNS_df2 = pd.read_excel(os.path.join(os.path.dirname(__file__), shared_BM_NL_Q2_DNS), sheet_name='client32_T-Mobile')
shared_BM_NL_Q2_DNS_df3 = pd.read_excel(os.path.join(os.path.dirname(__file__), shared_BM_NL_Q2_DNS), sheet_name='client33_Vodafone')
#shared_BM_NL_Q2_DNS
shared_BM_NL_Q2_DNS_df1.columns = shared_BM_NL_Q2_DNS_df1.columns.str.replace(' ', '_')
shared_BM_NL_Q2_DNS_df1.columns = shared_BM_NL_Q2_DNS_df1.columns.str.replace('n', '')
shared_BM_NL_Q2_DNS_df1.columns = shared_BM_NL_Q2_DNS_df1.columns.str.replace(r"[^a-zA-Zd_]+", "")
shared_BM_NL_Q2_DNS_df1.columns = map(str.lower, shared_BM_NL_Q2_DNS_df1.columns)
shared_BM_NL_Q2_DNS_df2.columns = shared_BM_NL_Q2_DNS_df2.columns.str.replace(' ', '_')
shared_BM_NL_Q2_DNS_df2.columns = shared_BM_NL_Q2_DNS_df2.columns.str.replace('n', '')
shared_BM_NL_Q2_DNS_df2.columns = shared_BM_NL_Q2_DNS_df2.columns.str.replace(r"[^a-zA-Zd_]+", "")
shared_BM_NL_Q2_DNS_df2.columns = map(str.lower, shared_BM_NL_Q2_DNS_df2.columns)
shared_BM_NL_Q2_DNS_df3.columns = shared_BM_NL_Q2_DNS_df3.columns.str.replace(' ', '_')
shared_BM_NL_Q2_DNS_df3.columns = shared_BM_NL_Q2_DNS_df3.columns.str.replace('n', '')
shared_BM_NL_Q2_DNS_df3.columns = shared_BM_NL_Q2_DNS_df3.columns.str.replace(r"[^a-zA-Zd_]+", "")
shared_BM_NL_Q2_DNS_df3.columns = map(str.lower, shared_BM_NL_Q2_DNS_df3.columns)
dataframes2 = [shared_BM_NL_Q2_DNS_df1, shared_BM_NL_Q2_DNS_df2, shared_BM_NL_Q2_DNS_df3]
join2 = pd.concat(dataframes2).reset_index(drop=True)

#编辑:

我试图创建一些接近我想要的东西,如下代码:

for ws, c in zip(sheet_names, cols):
df[ws] = pd.read_excel(shared_BM_NL_Q2_DNS, sheet_name = ws, usecols = c)
df[ws].columns = df[ws].columns.str.replace(' ', '_')
df[ws].columns = df[ws].columns.str.replace('n', '')
df[ws].columns = df[ws].columns.str.replace(r"[^a-zA-Zd_]+", "")
df[ws].columns = map(str.lower, df[ws].columns)
join2 = pd.concat(ws).reset_index(drop=True)

但我发现了以下错误:

Traceback (most recent call last):
File "D:/Python Projects/MyAuditPy/pd_read.py", line 29, in <module>
join2 = pd.concat(ws).reset_index(drop=True)
File "C:UsersDELLAppDataLocalProgramsPythonPython38-32libsite-packagespandascorereshapeconcat.py", line 271, in concat
op = _Concatenator(
File "C:UsersDELLAppDataLocalProgramsPythonPython38-32libsite-packagespandascorereshapeconcat.py", line 306, in __init__
raise TypeError(
TypeError: first argument must be an iterable of pandas objects, you passed an object of type "str"

首先,我尽量避免直接分配.columns属性。出错的风险太大。

我会这么做:


def renamer(c):
# I'm assuming this does what you want. hard to tell without knowing
# what your input and output looks like.
return (
c.strip().split(' ')[-1].lower()
)
df = pd.concat([
pd.read_excel(shared_BM_NL_Q2_DNS, sheet_name=ws, usecols=c)
.rename(columns=renamer)
for ws, c in zip(sheet_names, cols)
], ignore_index=True).reset_index(drop=True)

好的,我想我可能已经像下面的代码一样解决了它,它运行良好:

import os
import pandas as pd
shared_BM_NL_Q2_DNS = r'Shared_BM_NL_Q2_DNS.xlsx'
sheet_names = ['client31_KPN', 'client32_T-Mobile', 'client33_Vodafone']
cols = ['A:AB', 'A:AB', 'A:AB']
df = {}
for ws, c in zip(sheet_names, cols):
df[ws] = pd.read_excel(shared_BM_NL_Q2_DNS, sheet_name = ws, usecols = c)
df[ws].columns = df[ws].columns.str.replace(' ', '_')
df[ws].columns = df[ws].columns.str.replace('n', '')
df[ws].columns = df[ws].columns.str.replace(r"[^a-zA-Zd_]+", "")
df[ws].columns = map(str.lower, df[ws].columns)
join2 = pd.concat(df, ignore_index=True).reset_index(drop=True)
join2.to_csv("shared_BM_NL_Q2_DNS.csv")

请给我一些建议

最新更新