在Pandas中,当I join
时,连接的数据相对于原始DataFrame不对齐:
import os
import pandas as pd
import statsmodels.formula.api as sm
import numpy as np
import matplotlib.pyplot as plt
flu_train = pd.read_csv('FluTrain.csv')
# From: https://courses.edx.org/c4x/MITx/15.071x/asset/FluTrain.csv
cols = ['Ystart', 'Mstart', 'Dstart', 'Yend', 'Mend', 'Dend']
flu_train = flu_train.join(pd.DataFrame(flu_train.Week.str.findall('d+').tolist(), dtype=np.int64, columns=cols))
flu_trend_1 = sm.ols('log(ILI) ~ Queries', flu_train).fit()
flu_test = pd.read_csv('FluTest.csv')
# From: https://courses.edx.org/c4x/MITx/15.071x/asset/FluTest.csv
flu_test = flu_test.join(pd.DataFrame(flu_test.Week.str.findall('d+').tolist(), dtype=np.int64, columns=cols))
flu_test = flu_test.join(pd.DataFrame(exp(flu_trend_1.predict(flu_test)), columns=['ILIPred1'] ))
flu_train['ILIShift2'] = flu_train.ILI.shift(2)
flu_trend_2 = sm.ols('log(ILI) ~ Queries + log(ILIShift2)', flu_train).fit()
flu_test['ILIShift2'] = flu_test.ILI.shift(2)
# Note that this does not work in a simplified example
# See -- http://stackoverflow.com/q/22457880/
flu_test[:2].ILIShift2 = list(flu_train[-2:].ILI)
# This SHIFTS the joined column "up" two rows, loosing the first two values of ILIPred2 and making the last 2 'NaN'
flu_test = flu_test.join(pd.DataFrame(exp(flu_trend_2.predict(flu_test)), columns=['ILIPred2']))
最后一条语句将连接的列"上移"两行,失去ILIPred2的前两个值,并使最后两个值成为"NaN"。我期望连接的列与所有其他列对齐。
为什么会发生这种情况,我该如何预防?
这个连接的数据框(pd.DataFrame(np.exp(flu_trend_2.predict(flu_test)), columns=['ILIPred2'])
)的索引从0到49。
您要将其加入索引为0到51的flu_test
。
因此,当这些指数不匹配(50和51)时,您将得到我希望的NaN
。
如果您想强制连接的列位于主数据框的底部,您可以执行以下操作(注意iloc
和row_shift
变量的使用):
import os
import pandas as pd
import statsmodels.formula.api as sm
import numpy as np
import matplotlib.pyplot as plt
row_shift = 2
flu_train = pd.read_csv('https://courses.edx.org/c4x/MITx/15.071x/asset/FluTrain.csv')
cols = ['Ystart', 'Mstart', 'Dstart', 'Yend', 'Mend', 'Dend']
flu_train = flu_train.join(pd.DataFrame(flu_train.Week.str.findall('d+').tolist(), dtype=np.int64, columns=cols))
flu_trend_1 = sm.ols('np.log(ILI) ~ Queries', flu_train).fit()
flu_test = pd.read_csv('https://courses.edx.org/c4x/MITx/15.071x/asset/FluTest.csv')
flu_test = flu_test.join(pd.DataFrame(flu_test.Week.str.findall('d+').tolist(), dtype=np.int64, columns=cols))
flu_test = flu_test.join(pd.DataFrame(np.exp(flu_trend_1.predict(flu_test)), columns=['ILIPred1'] ))
flu_train['ILIShift2'] = flu_train.ILI.shift(row_shift)
flu_trend_2 = sm.ols('np.log(ILI) ~ Queries + np.log(ILIShift2)', flu_train).fit()
flu_test['ILIShift2'] = flu_test.ILI.shift(row_shift)
# Note that this does not work in a simplified example
# See -- http://stackoverflow.com/q/22457880/
flu_test.iloc[:2].ILIShift2 = list(flu_train.iloc[-row_shift:].ILI)
joiner = pd.DataFrame(np.exp(flu_trend_2.predict(flu_test)), columns=['ILIPred2'], index=flu_test.index[row_shift:])
flu_test.join(joiner)
这给了我:
Week ILI Queries Ystart Mstart Dstart Yend Mend Dend ILIPred1 ILIShift2 ILIPred2
0 2012-01-01 - 2012-01-07 1.766707 0.593625 2012 1 1 2012 1 7 3.520332 NaN NaN
1 2012-01-08 - 2012-01-14 1.543401 0.499336 2012 1 8 2012 1 14 2.662689 NaN NaN
2 2012-01-15 - 2012-01-21 1.647615 0.500664 2012 1 15 2012 1 21 2.673181 1.766707 2.140941
3 2012-01-22 - 2012-01-28 1.684297 0.479416 2012 1 22 2012 1 28 2.510160 1.543401 1.907817
4 2012-01-29 - 2012-02-04 1.863542 0.471448 2012 1 29 2012 2 4 2.451624 1.647615 1.971504
5 2012-02-05 - 2012-02-11 1.864079 0.503320 2012 2 5 2012 2 11
...
46 2012-11-18 - 2012-11-24 2.304625 0.511288 2012 11 18 2012 11 24 2.758619 1.610915 2.042260
47 2012-11-25 - 2012-12-01 2.225997 0.609562 2012 11 25 2012 12 1 3.690445 1.733293 2.424141
48 2012-12-02 - 2012-12-08 2.978047 0.671979 2012 12 2 2012 12 8 4.439679 2.304625 3.160283
49 2012-12-09 - 2012-12-15 3.600230 0.705179 2012 12 9 2012 12 15 4.898351 2.225997 3.220680
50 2012-12-16 - 2012-12-22 4.547268 0.787517 2012 12 16 2012 12 22 6.250888 2.978047 4.322513
51 2012-12-23 - 2012-12-29 6.033614 0.805421 2012 12 23 2012 12 29 6.591252 3.600230 5.006438