用透视表中的中值替换缺失值的函数



我的目标是编写一个函数,用数据透视表提供的中位数'total_income'替换'total_income'列中的缺失值,使用行'education'和'income_type'索引数据透视表。我想用这些中位数来填充,这样这些值就尽可能是最优的。下面是我要测试的内容:

#This is the first 5 rows of the dataframe as a dictionary:
{'index': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
'children': {0: 1, 1: 1, 2: 0, 3: 3, 4: 0},
'days_employed': {0: 8437.673027760233,
1: 4024.803753850451,
2: 5623.422610230956,
3: 4124.747206540018,
4: 340266.07204682194},
'dob_years': {0: 42, 1: 36, 2: 33, 3: 32, 4: 53},
'education': {0: "bachelor's degree",
1: 'secondary education',
2: 'secondary education',
3: 'secondary education',
4: 'secondary education'},
'education_id': {0: 0, 1: 1, 2: 1, 3: 1, 4: 1},
'family_status': {0: 'married',
1: 'married',
2: 'married',
3: 'married',
4: 'civil partnership'},
'family_status_id': {0: 0, 1: 0, 2: 0, 3: 0, 4: 1},
'gender': {0: 'F', 1: 'F', 2: 'M', 3: 'M', 4: 'F'},
'income_type': {0: 'employee',
1: 'employee',
2: 'employee',
3: 'employee',
4: 'retiree'},
'debt': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0},
'total_income': {0: 'NaN',
1: 17932.802,
2: 'NaN',
3: 'NaN',
4: 'NaN'},
'purpose': {0: 'purchase of the house',
1: 'car purchase',
2: 'purchase of the house',
3: 'supplementary education',
4: 'to have a wedding'},
'age_group': {0: 'adult',
1: 'adult',
2: 'adult',
3: 'adult',
4: 'older adult'}}
#this is the first 5 row of the pivot table as a dictionary:
{"bachelor's degree": {('adult', 'business'): 33702.312999999995,
('adult', 'civil servant'): 28178.519,
('adult', 'employee'): 27096.019,
('adult', 'paternity / maternity leave'): nan,
('adult', 'retiree'): 32491.072},
'graduate degree': {('adult', 'business'): nan,
('adult', 'civil servant'): 17822.756999999998,
('adult', 'employee'): 25161.5835,
('adult', 'paternity / maternity leave'): nan,
('adult', 'retiree'): nan},
'primary education': {('adult', 'business'): 27305.901,
('adult', 'civil servant'): 30545.949,
('adult', 'employee'): 19887.121,
('adult', 'paternity / maternity leave'): nan,
('adult', 'retiree'): 15412.875},
'secondary education': {('adult', 'business'): 26144.03,
('adult', 'civil servant'): 21740.913,
('adult', 'employee'): 22166.627,
('adult', 'paternity / maternity leave'): 8612.661,
('adult', 'retiree'): 21219.068},
'some college': {('adult', 'business'): 31437.118,
('adult', 'civil servant'): 29281.5325,
('adult', 'employee'): 27035.721,
('adult', 'paternity / maternity leave'): nan,
('adult', 'retiree'): 7015.422500000001}}

def fill_income(row):
total_income = row['total_income']
age_group =  row['age_group']
income_type = row['income_type']
education = row['education']
table = df.pivot_table(index=['age_group','income_type' ], columns='education', values='total_income', aggfunc='median')

if total_income == 'NaN':
if age_group =='adult':
return table.loc[education, income_type]
if age_group =='older adult':
return table.loc[education, income_type]
if age_group =='younger adult':
return table.loc[education, income_type]
#an anticipated outcome

print(fill_income(df.loc(1))
22166.6270

谢谢你的帮助!

带注释的代码(无需pivot)

# Change the dtype to numeric
df['total_income'] = df['total_income'].astype(float)
# Calculated median per unique age_group, education and income_type
median = df.groupby(['age_group', 'education', 'income_type'])['total_income'].transform('median')
# Fill the NaN value with median
df['total_income'] = df['total_income'].fillna(median)

index  children  days_employed  dob_years            education  education_id      family_status  family_status_id gender income_type  debt  total_income                  purpose    age_group
0      0         1    8437.673028         42    bachelor's degree             0            married                 0      F    employee     0           NaN    purchase of the house        adult
1      1         1    4024.803754         36  secondary education             1            married                 0      F    employee     0     17932.802             car purchase        adult
2      2         0    5623.422610         33  secondary education             1            married                 0      M    employee     0     17932.802    purchase of the house        adult
3      3         3    4124.747207         32  secondary education             1            married                 0      M    employee     0     17932.802  supplementary education        adult
4      4         0  340266.072047         53  secondary education             1  civil partnership                 1      F     retiree     0           NaN        to have a wedding  older adult

最新更新