组合 Pandas GroupBy 并使用多索引数据帧进行转换



我对Python Pandas相当陌生,我很难将PandasGroupBytransform结合起来,以我想要的方式行事。我已经无法找到已发布的答案,但我可能错过了一些东西。

我有一个包含大量条目的数据帧,结构如下:

GLT_City = pd.read_csv('GlobalLandTemperaturesByCity.csv', sep=',')
GLT_City.head()
AvgTemp  AvgTempUncert   City    Country Lat     Long    year    month   day
0   6.068   1.737          Århus    Denmark 57.05N  10.33E  1743    11  01
5   5.788   3.624          Århus    Denmark 57.05N  10.33E  1744    04  01
6   10.644  1.283          Århus    Denmark 57.05N  10.33E  1744    05  01
7   14.051  1.347          Århus    Denmark 57.05N  10.33E  1744    06  01
8   16.082  1.396          Århus    Denmark 57.05N  10.33E  1744    07  01
10  12.781  1.454          Århus    Denmark 57.05N  10.33E  1744    09  01
11  7.950   1.630          Århus    Denmark 57.05N  10.33E  1744    10  01
12  4.639   1.302          Århus    Denmark 57.05N  10.33E  1744    11  01

我想计算每个城市每个月的加权平均温度,并使用transform()以最平滑的方式将其作为新列添加到我的原始数据框中,原因更进一步。

首先,我定义一个函数来计算加权平均值:

def wavg(group,data_name,weight_name, sigma=None):
data = group[data_name]
weight = group[weight_name]
#Check whether we have actual weights or measurement uncertainties
if sigma=='sigma':
weight = 1./weight
try:
return (data * weight).sum() / weight.sum()
except ZeroDivisionError:
return data.mean()

然后,我想结合GroupBytransform(),将此函数应用于我的数据框,并将结果添加为新列,例如:

GLT_City['WeightedMonthlyMean'] = GLT_City.groupby(['City','month']).transform(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma')

现在这会导致复制粘贴下面的非常冗长的错误消息

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError                                  Traceback (most recent call last)
<ipython-input-61-cef679f52b5f> in <module>()
----> 1 GLT_City['WeightedMonthlyMean'] = GLT_City.groupby(['City','month']).transform(wavg, 
'AvgTemp','AvgTemp', sigma='sigma')
~/anaconda/envs/python36/lib/python3.6/site-
packages/pandas/core/groupby.py in transform(self, func, *args, **kwargs)
3814                 result = getattr(self, func)(*args, **kwargs)
3815         else:
-> 3816             return self._transform_general(func, *args, **kwargs)
3817 
3818         # a reduction transform
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in _transform_general(self, func, *args, **kwargs)
3765                 # Try slow path and fast path.
3766                 try:
-> 3767                     path, res = self._choose_path(fast_path, slow_path, group)
3768                 except TypeError:
3769                     return self._transform_item_by_item(obj, fast_path)
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in _choose_path(self, fast_path, slow_path, group)
3861     def _choose_path(self, fast_path, slow_path, group):
3862         path = slow_path
-> 3863         res = slow_path(group)
3864 
3865         # if we make it here, test if we can use the fast path
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(group)
3856             fast_path = lambda group: func(group, *args, **kwargs)
3857             slow_path = lambda group: group.apply(
-> 3858                 lambda x: func(x, *args, **kwargs), axis=self.axis)
3859         return fast_path, slow_path
3860 
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
4260                         f, axis,
4261                         reduce=reduce,
-> 4262                         ignore_failures=ignore_failures)
4263             else:
4264                 return self._apply_broadcast(f, axis)
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
4356             try:
4357                 for i, v in enumerate(series_gen):
-> 4358                     results[i] = func(v)
4359                     keys.append(v.name)
4360             except Exception as e:
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
3856             fast_path = lambda group: func(group, *args, **kwargs)
3857             slow_path = lambda group: group.apply(
-> 3858                 lambda x: func(x, *args, **kwargs), axis=self.axis)
3859         return fast_path, slow_path
3860 
<ipython-input-58-181ef4bb1f30> in wavg(group, data_name, weight_name, sigma)
10 
11     #Extracting data and weights.
---> 12     data = group[data_name]
13     weight = group[weight_name]
14     #Check whether we have actual weights, or measurement uncertainties
~/anaconda/envs/python36/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
599         key = com._apply_if_callable(key, self)
600         try:
--> 601             result = self.index.get_value(self, key)
602 
603             if not is_scalar(result):
~/anaconda/envs/python36/lib/python3.6/site-
packages/pandas/core/indexes/base.py in get_value(self, series, key)
2475         try:
2476             return self._engine.get_value(s, k,
-> 2477                                           
tz=getattr(series.dtype, 'tz', None))
2478         except KeyError as e1:
2479             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: ('AvgTemp', 'occurred at index AvgTemp')

所以这显然是行不通的,但我不清楚为什么。任何指示/解决方案都将非常受欢迎。

我可以使用apply()方法来获得所需的输出,但由于我对组进行平均,因此我无法真正将其与原始数据帧合并,因为apply()生成的序列将具有不同的大小。

transform函数分别应用于每个组列。 将print语句放入wavg将帮助您查看问题:

def wavg(group,data_name,weight_name, sigma=None):
print(group)
...
df['WeightedMonthlyMean'] = df.groupby(['City','month']).transform(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma')

指纹

1    5.788
Name: AvgTemp, dtype: object

在提高KeyError之前.这表明group只是一个系列,而不是整个(组(数据帧。

因此,请使用apply,然后将result合并回df

result = df.groupby(['City','month']).apply(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma').reset_index(name='wavg')
result = pd.merge(df, result)

例如

import pandas as pd
df = pd.DataFrame({'AvgTemp': [6.068, 5.787999999999999, 10.644, 14.050999999999998, 16.082, 12.780999999999999, 7.95, 4.638999999999999], 'AvgTempUncert': [1.7369999999999999, 3.6239999999999997, 1.2830000000000001, 1.347, 1.396, 1.454, 1.63, 1.3019999999999998], 'City': ['Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus', 'Århus'], 'Country': ['Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark', 'Denmark'], 'Lat': ['57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N', '57.05N'], 'Long': ['10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E', '10.33E'], 'day': [1, 1, 1, 1, 1, 1, 1, 1], 'month': [11, 4, 5, 6, 7, 9, 10, 11], 'year': [1743, 1744, 1744, 1744, 1744, 1744, 1744, 1744]}) 
def wavg(group,data_name,weight_name, sigma=None):
data = group[data_name]
weight = group[weight_name]
#Check whether we have actual weights or measurement uncertainties
if sigma=='sigma':
weight = 1./weight
try:
return (data * weight).sum() / weight.sum()
except ZeroDivisionError:
return data.mean()
result = df.groupby(['City','month']).apply(wavg, 'AvgTemp','AvgTempUncert', sigma='sigma').reset_index(name='wavg')
result = pd.merge(df, result)
print(result)

收益 率

AvgTemp  AvgTempUncert   City  Country     Lat    Long  day  month  year       wavg  
0    6.068          1.737  Århus  Denmark  57.05N  10.33E    1     11  1743   5.251227   
1    4.639          1.302  Århus  Denmark  57.05N  10.33E    1     11  1744   5.251227   
2    5.788          3.624  Århus  Denmark  57.05N  10.33E    1      4  1744   5.788000   
3   10.644          1.283  Århus  Denmark  57.05N  10.33E    1      5  1744  10.644000   
4   14.051          1.347  Århus  Denmark  57.05N  10.33E    1      6  1744  14.051000   
5   16.082          1.396  Århus  Denmark  57.05N  10.33E    1      7  1744  16.082000   
6   12.781          1.454  Århus  Denmark  57.05N  10.33E    1      9  1744  12.781000   
7    7.950          1.630  Århus  Denmark  57.05N  10.33E    1     10  1744   7.950000   

使用apply然后merge将其放入同一DataFrame怎么样?例:

import numpy as np
import pandas as pd
data = pd.DataFrame({'City': np.random.randint(0, 4, 1000), 'Month': np.random.randint(1, 12, 1000), 'T': np.random.randn(1000)})
pd.merge(data, data.groupby(['City', 'Month']).apply(lambda x: x['T']*2).reset_index()[['City', 'Month', 'T']].rename(columns={'T': 'WeightedT'}), left_on=['City', 'Month'], right_on=['City', 'Month'])

最新更新