假设你有一个这样的 pandas.DataFrame:
import pandas as pd
import numpy as np
from math import exp, log10
df = pd.DataFrame({'S' : ['C', 'A', 'A', 'B', 'B', 'A'],
'ID': ['ID3', 'ID1', 'ID2', 'ID4', 'ID4', 'ID1'],
'M' : ['Y', 'X', 'X', 'Y', 'Z', 'X'],
'V' : ['<1', '<0.5', 3, '>10', 7, 6]
})
df
# S ID M V
#0 C ID3 Y <1
#1 A ID1 X <0.5
#2 A ID2 X 3
#3 B ID4 Y >10
#4 B ID4 Z 7
#5 A ID1 X 6
以及将每个M
映射到 x 函数的字典,例如:
M_to_transf_dict = {'X' : 'x', 'Y' : 'exp(x)', 'Z' : 'log10(x)'}
这是您需要做的:
- 将
V
拆分为数字部分(V_u
)和限定符(V_c
:"<",">",如果尚未成为数字或可解释) - 按
S, M
分组,对于每个组:- 计算
V_u
的平均V_mean
、V_N
计数和样本标准差V_sample_sd
- 制作唯一
ID
的逗号分隔列表,并将结果存储到ID
- 查找
V_c
中最常用的"<"或">"限定符,并将结果存储到V_mean_c
- 应用每个
M
对应的x函数进行V_mean
,并将结果存储到TV_mean
- 当 V_mean_c 为"<"或">"时,将其作为前缀应用到 V_mean 和 TV_mean 上
- 计算
经过多次尝试,我整理了一些似乎有效的东西,但我对性能有疑问.
我看到一些帖子(例如这个)质疑.apply
的使用,事实上,从一些测试来看,将V
拆分为外部映射后进行列分配比.apply
快得多:
def unc(v):
    """Split a value into its numeric part and its qualifier.

    Returns a two-element list [u, c] where u is the float value
    (pd.NA when v cannot be parsed) and c is the qualifier:
    '<' or '>' for strings such as '<0.5' or '>10', '=' otherwise.
    """
    try:
        # Plain numeric value (int, float, or a purely numeric string).
        return [float(v), "="]
    except ValueError:
        # String with a leading qualifier, e.g. '<0.5'.
        # The inner conversion is guarded too: in the original code a
        # ValueError raised here escaped uncaught (the sibling bare
        # `except:` only covers the outer try body).
        try:
            s = str(v)
            return [float(s[1:]), s[0]]
        except (ValueError, IndexError):
            return [pd.NA, "="]
    except Exception:
        # Non-string, non-numeric input (e.g. None): no usable value.
        return [pd.NA, "="]
%timeit df[['V_u', 'V_c']] = df.apply(lambda row : unc(row['V']), axis = 1, result_type = 'expand')
# 698 µs ± 5.22 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%%timeit
uc = df['V'].map(unc)
u = [uci[0] for uci in uc.values]
c = [uci[1] for uci in uc.values]
df['V_u'] = u
df['V_c'] = c
# 129 µs ± 3.59 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%%timeit
uc = df['V'].map(unc)
u, c = [], []
for uci in uc:
u.append(uci[0])
c.append(uci[1])
df['V_u'] = u
df['V_c'] = c
# 124 µs ± 1.11 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
(请注意,我在每次timeit
之前都重制了df
)。
对于我描述的其他操作,除了最后两个操作,我使用了带有命名聚合的.groupby.agg
:
def majority(c_list):
    """Return the most frequent '<' or '>' qualifier, or '=' if neither occurs."""
    qualifiers = list(c_list)
    n_lt = qualifiers.count('<')
    n_gt = qualifiers.count('>')
    if not (n_lt or n_gt):
        # No inequality qualifiers at all: treat the group as exact.
        return '='
    # Ties resolve to '>', matching the sorted()-based tie-breaking
    # of the previous implementation.
    return '>' if n_gt >= n_lt else '<'
%%timeit
df_summary_by_S_M = df.groupby(['S','M'], as_index = False).agg(
ID = pd.NamedAgg(column = 'ID', aggfunc = lambda x : ','.join(np.unique(x))),
V_mean = pd.NamedAgg(column = 'V_u', aggfunc = 'mean'),
V_mean_c = pd.NamedAgg(column = 'V_c', aggfunc = majority),
V_N = pd.NamedAgg(column = 'V_u', aggfunc = 'count'),
V_sample_sd = pd.NamedAgg(column = 'V_u', aggfunc = 'std')
)
# 4.17 ms ± 61.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
df_summary_by_S_M
# S M ID V_mean V_mean_c V_N V_sample_sd
#0 A X ID1,ID2 3.166667 < 3 2.753785
#1 B Y ID4 10.000000 > 1 NaN
#2 B Z ID4 7.000000 = 1 NaN
#3 C Y ID3 1.000000 < 1 NaN
我不知道如何放置这个时间,特别是因为我想.agg
是一种.apply
形式,所以我可能没有使用最好的方法,看过之前的结果.
在我正在处理的真实数据集上,大约有 450 K 条记录,通过分组减少到 230 K, 此聚合大约需要 1 分钟.
我将不得不处理更大的数据集,这可能会成为一个问题。
有人能够建议一种更有效/性能更高的方法来进行我描述的分组计算吗?
或者这被认为是这种操作的最新性能吗?我真的没有基准来判断这一点,因此我在这里提出问题。
对于最后两个步骤,我想我将使用列表推导,例如:
%timeit df_summary_by_S_M['TV_mean'] = [eval(t) for x, t in
zip(df_summary_by_S_M['V_mean'], df_summary_by_S_M['M'].map(M_to_transf_dict))]
# 326 µs ± 5.06 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
df_summary_by_S_M
# S M ID V_mean V_mean_c V_N V_sample_sd TV_mean
#0 A X ID1,ID2 3.166667 < 3 2.753785 3.166667
#1 B Y ID4 10.000000 > 1 NaN 22026.465795
#2 B Z ID4 7.000000 = 1 NaN 0.845098
#3 C Y ID3 1.000000 < 1 NaN 2.718282
和:
df_summary_by_S_M['V_mean'] = [c + str(v) if c != '=' else v for c, v in
zip(df_summary_by_S_M['V_mean_c'], df_summary_by_S_M['V_mean'])]
df_summary_by_S_M['TV_mean'] = [c + str(v) if c != '=' else v for c, v in
zip(df_summary_by_S_M['V_mean_c'], df_summary_by_S_M['TV_mean'])]
df_summary_by_S_M
# S M ID V_mean V_mean_c V_N V_sample_sd
#0 A X ID1,ID2 <3.1666666666666665 < 3 2.753785
#1 B Y ID4 >10.0 > 1 NaN
#2 B Z ID4 7.0 = 1 NaN
#3 C Y ID3 <1.0 < 1 NaN
# TV_mean
#0 <3.1666666666666665
#1 >22026.465794806718
#2 0.845098
#3 <2.718281828459045
同样,除非有人可以提出更有效的替代方案(?)。
EDIT:在试用@mozway的代码之前,先对原始代码的所有操作一起计时。
import pandas as pd
import numpy as np
from math import exp, log10
df0 = pd.DataFrame({'S' : ['C', 'A', 'A', 'B', 'B', 'A'],
'ID': ['ID3', 'ID1', 'ID2', 'ID4', 'ID4', 'ID1'],
'M' : ['Y', 'X', 'X', 'Y', 'Z', 'X'],
'V' : ['<1', '<0.5', 3, '>10', 7, 6]
})
M_to_transf_dict = {'X' : 'x', 'Y' : 'exp(x)', 'Z' : 'log10(x)'}
def unc(v):
    """Split a value into its numeric part and its qualifier.

    Returns a two-element list [u, c] where u is the float value
    (pd.NA when v cannot be parsed) and c is the qualifier:
    '<' or '>' for strings such as '<0.5' or '>10', '=' otherwise.
    """
    try:
        # Plain numeric value (int, float, or a purely numeric string).
        return [float(v), "="]
    except ValueError:
        # String with a leading qualifier, e.g. '<0.5'.
        # The inner conversion is guarded too: in the original code a
        # ValueError raised here escaped uncaught (the sibling bare
        # `except:` only covers the outer try body).
        try:
            s = str(v)
            return [float(s[1:]), s[0]]
        except (ValueError, IndexError):
            return [pd.NA, "="]
    except Exception:
        # Non-string, non-numeric input (e.g. None): no usable value.
        return [pd.NA, "="]
def majority(c_list):
    """Return the most frequent '<' or '>' qualifier, or '=' if neither occurs."""
    qualifiers = list(c_list)
    n_lt = qualifiers.count('<')
    n_gt = qualifiers.count('>')
    if not (n_lt or n_gt):
        # No inequality qualifiers at all: treat the group as exact.
        return '='
    # Ties resolve to '>', matching the sorted()-based tie-breaking
    # of the previous implementation.
    return '>' if n_gt >= n_lt else '<'
%%timeit
df = df0.copy()
uc = df['V'].map(unc)
u, c = [], []
for uci in uc:
u.append(uci[0])
c.append(uci[1])
df['V_u'] = u
df['V_c'] = c
df = df.groupby(['S','M'], as_index = False).agg(
ID = pd.NamedAgg(column = 'ID', aggfunc = lambda x : ','.join(np.unique(x))),
V_mean = pd.NamedAgg(column = 'V_u', aggfunc = 'mean'),
V_mean_c = pd.NamedAgg(column = 'V_c', aggfunc = majority),
V_N = pd.NamedAgg(column = 'V_u', aggfunc = 'count'),
V_sample_sd = pd.NamedAgg(column = 'V_u', aggfunc = 'std')
)
df['TV_mean'] = [eval(t) for x, t in
zip(df['V_mean'], df['M'].map(M_to_transf_dict))]
df['V_mean'] = [c + str(v) if c != '=' else v for c, v in
zip(df['V_mean_c'], df['V_mean'])]
df['TV_mean'] = [c + str(v) if c != '=' else v for c, v in
zip(df['V_mean_c'], df['TV_mean'])]
结果:
# 5.29 ms ± 100 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
@mozway的代码。
def majority(s):
mode = s.mode()
return '=' if len(mode)>1 else mode.iloc[0]
M_dic = {'Y' : np.exp, 'Z' : np.log10}
df = df0.copy()
%%timeit
# NOTE(review): the regex backslashes were lost in formatting; restored
# below as a raw string so \D/\d match non-digit/digit characters
# instead of the literal letters 'D' and 'd'.
(df
 .join(df['V']
       .str.extract(r'(?P<V_c>\D)?(?P<V_u>\d+(?:\.\d+)?)')
       .astype({'V_u': float})
       # Rows with no qualifier get '='; unparsed V_u falls back to V.
       .fillna({'V_c': '=', 'V_u': df['V']})
       )
 # Apply the per-M transform (np.exp / np.log10) before grouping so the
 # transformed mean can be computed as an ordinary aggregation.
 .assign(TV=lambda d: d.groupby('M')['V_u']
         .apply(lambda g: M_dic[g.name](g) if g.name in M_dic else g))
 .groupby(['S', 'M'], as_index=False)
 .agg(**{'ID': ('ID', lambda x: ','.join(x.unique())),
         'V_mean': ('V_u', 'mean'),
         'V_mean_c': ('V_c', majority),  ## FIXME
         'V_N': ('V_u', 'count'),
         'V_sample_sd': ('V_u', 'std'),
         'TV_mean': ('TV', 'mean'),
         })
 # Prefix the transformed mean with its majority qualifier ('=' → none).
 .assign(TV_mean=lambda d: d['V_mean_c'].mask(d['V_mean_c'].eq('='), '')
         + d['TV_mean'].astype(str))
 )
结果:
# 8.05 ms ± 259 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
EDIT 2在更逼真的模拟数据集上重试
import pandas as pd
import numpy as np
from numpy.random import default_rng
from math import exp, log10
# Simulate dataset to process
rng = default_rng(12345)
S = rng.choice(range(250000), 450000)
ID = S.copy()
S = [f"S_{Si}" for Si in S]
ID = [f"ID_{IDi}" for IDi in ID]
pM = [np.sqrt(1+i) for i in range(100)]
pM = pM / np.sum(pM)
M = rng.choice(range(100), 450000, p = pM)
M_to_transf_dict = dict()
for i in range(0,10):
M_to_transf_dict[f"M_{i}"] = 'exp(x)'
for i in range(10,30):
M_to_transf_dict[f"M_{i}"] = 'x'
for i in range(30,100):
M_to_transf_dict[f"M_{i}"] = 'log10(x)'
M = [f"M_{Mi}" for Mi in M]
V = rng.random(450000)
Q = rng.choice(['', '<', '>'], 450000, p = [0.9, 0.05, 0.05])
V = [f"{q}{v}" for q, v in zip(Q, V)]
df0 = pd.DataFrame({'S' : S, 'ID' : ID, 'M' : M, 'V' : V})
原始代码:
def unc(v):
    """Split a value into its numeric part and its qualifier.

    Returns a two-element list [u, c] where u is the float value
    (pd.NA when v cannot be parsed) and c is the qualifier:
    '<' or '>' for strings such as '<0.5' or '>10', '=' otherwise.
    """
    try:
        # Plain numeric value (int, float, or a purely numeric string).
        return [float(v), "="]
    except ValueError:
        # String with a leading qualifier, e.g. '<0.5'.
        # The inner conversion is guarded too: in the original code a
        # ValueError raised here escaped uncaught (the sibling bare
        # `except:` only covers the outer try body).
        try:
            s = str(v)
            return [float(s[1:]), s[0]]
        except (ValueError, IndexError):
            return [pd.NA, "="]
    except Exception:
        # Non-string, non-numeric input (e.g. None): no usable value.
        return [pd.NA, "="]
def majority(c_list):
    """Return the most frequent '<' or '>' qualifier, or '=' if neither occurs."""
    qualifiers = list(c_list)
    n_lt = qualifiers.count('<')
    n_gt = qualifiers.count('>')
    if not (n_lt or n_gt):
        # No inequality qualifiers at all: treat the group as exact.
        return '='
    # Ties resolve to '>', matching the sorted()-based tie-breaking
    # of the previous implementation.
    return '>' if n_gt >= n_lt else '<'
%%timeit
df = df0.copy()
uc = df['V'].map(unc)
u, c = [], []
for uci in uc:
u.append(uci[0])
c.append(uci[1])
df['V_u'] = u
df['V_c'] = c
df = df.groupby(['S','M'], as_index = False).agg(
ID = pd.NamedAgg(column = 'ID', aggfunc = lambda x : ','.join(np.unique(x))),
V_mean = pd.NamedAgg(column = 'V_u', aggfunc = 'mean'),
V_mean_c = pd.NamedAgg(column = 'V_c', aggfunc = majority),
V_N = pd.NamedAgg(column = 'V_u', aggfunc = 'count'),
V_sample_sd = pd.NamedAgg(column = 'V_u', aggfunc = 'std')
)
df['TV_mean'] = [eval(t) for x, t in
zip(df['V_mean'], df['M'].map(M_to_transf_dict))]
df['V_mean'] = [c + str(v) if c != '=' else v for c, v in
zip(df['V_mean_c'], df['V_mean'])]
df['TV_mean'] = [c + str(v) if c != '=' else v for c, v in
zip(df['V_mean_c'], df['TV_mean'])]
结果:
# 20 s ± 289 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
@mozway的代码(M_dic
根据新的M
组而改变):
def majority(s):
    """Return the single most common value of Series s, or '=' on a tie."""
    modes = s.mode()
    if len(modes) > 1:
        # Ambiguous: several values tie for most frequent.
        return '='
    return modes.iloc[0]
M_dic = dict()
for k in M_to_transf_dict:
if M_to_transf_dict[k] == 'log10(x)':
M_dic[k] = np.log10
elif M_to_transf_dict[k] == 'exp(x)':
M_dic[k] = np.exp
df = df0.copy()
%%timeit
# NOTE(review): the regex backslashes were lost in formatting; restored
# below as a raw string so \D/\d match non-digit/digit characters
# instead of the literal letters 'D' and 'd'.
(df
 .join(df['V']
       .str.extract(r'(?P<V_c>\D)?(?P<V_u>\d+(?:\.\d+)?)')
       .astype({'V_u': float})
       # Rows with no qualifier get '='; unparsed V_u falls back to V.
       .fillna({'V_c': '=', 'V_u': df['V']})
       )
 # Apply the per-M transform (np.exp / np.log10) before grouping so the
 # transformed mean can be computed as an ordinary aggregation.
 .assign(TV=lambda d: d.groupby('M')['V_u']
         .apply(lambda g: M_dic[g.name](g) if g.name in M_dic else g))
 .groupby(['S', 'M'], as_index=False)
 .agg(**{'ID': ('ID', lambda x: ','.join(x.unique())),
         'V_mean': ('V_u', 'mean'),
         'V_mean_c': ('V_c', majority),  ## FIXME
         'V_N': ('V_u', 'count'),
         'V_sample_sd': ('V_u', 'std'),
         'TV_mean': ('TV', 'mean'),
         })
 # Prefix the transformed mean with its majority qualifier ('=' → none).
 .assign(TV_mean=lambda d: d['V_mean_c'].mask(d['V_mean_c'].eq('='), '')
         + d['TV_mean'].astype(str))
 )
结果:
# 52.3 s ± 436 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
EDIT 3在模拟具有重复{S, M}
条目的数据集后重试。
rng = default_rng(12345)
S = rng.choice(range(200000), 225000)
ID = S.copy()
S = [f"S_{Si}" for Si in S]
ID = [f"ID_{IDi}" for IDi in ID]
pM = [np.sqrt(1+i) for i in range(100)]
pM = pM / np.sum(pM)
M = rng.choice(range(100), 225000, p = pM)
M_to_transf_dict = dict()
for i in range(0,10):
M_to_transf_dict[f"M_{i}"] = 'exp(x)'
for i in range(10,30):
M_to_transf_dict[f"M_{i}"] = 'x'
for i in range(30,100):
M_to_transf_dict[f"M_{i}"] = 'log10(x)'
M = [f"M_{Mi}" for Mi in M]
S = S + S
ID = ID + ID
M = M + M
V = rng.random(450000)
Q = rng.choice(['', '<', '>'], 450000, p = [0.9, 0.05, 0.05])
V = [f"{q}{v}" for q, v in zip(Q, V)]
df0 = pd.DataFrame({'S' : S, 'ID' : ID, 'M' : M, 'V' : V})
原始代码:
10.6 s ± 154 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
@mozway的代码:
25.8 s ± 550 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
(并且输出看起来不同)。
由于循环、apply
和eval
,您的代码很慢。
这是一种更快的方法(前提是您的小组规模足够大):
def majority(s):
    """Return the single most common value of Series s, or '=' on a tie."""
    modes = s.mode()
    if len(modes) > 1:
        # Ambiguous: several values tie for most frequent.
        return '='
    return modes.iloc[0]
M_dic = {'Y' : np.exp, 'Z' : np.log10}
# NOTE(review): the regex backslashes were lost in formatting; restored
# below as a raw string so \D/\d match non-digit/digit characters
# instead of the literal letters 'D' and 'd'.
(df
 .join(df['V']
       .str.extract(r'(?P<V_c>\D)?(?P<V_u>\d+(?:\.\d+)?)')
       .astype({'V_u': float})
       # Rows with no qualifier get '='; unparsed V_u falls back to V.
       .fillna({'V_c': '=', 'V_u': df['V']})
       )
 # Apply the per-M transform (np.exp / np.log10) before grouping so the
 # transformed mean can be computed as an ordinary aggregation.
 .assign(TV=lambda d: d.groupby('M')['V_u']
         .apply(lambda g: M_dic[g.name](g) if g.name in M_dic else g))
 .groupby(['S', 'M'], as_index=False)
 .agg(**{'ID': ('ID', lambda x: ','.join(x.unique())),
         'V_mean': ('V_u', 'mean'),
         'V_mean_c': ('V_c', majority),  ## FIXME
         'V_N': ('V_u', 'count'),
         'V_sample_sd': ('V_u', 'std'),
         'TV_mean': ('TV', 'mean'),
         })
 # Prefix the transformed mean with its majority qualifier ('=' → none).
 .assign(TV_mean=lambda d: d['V_mean_c'].mask(d['V_mean_c'].eq('='), '')
         + d['TV_mean'].astype(str))
 )
输出:
S M ID V_mean V_mean_c V_N V_sample_sd TV_mean
0 A X ID1,ID2 3.166667 = 3 2.753785 3.1666666666666665
1 B Y ID4 10.000000 > 1 NaN >22026.465794806718
2 B Z ID4 7.000000 = 1 NaN 0.8450980400142568
3 C Y ID3 1.000000 < 1 NaN <2.718281828459045