Pandas和Dask产生了不同的结果(我想是因为我在Dask中做错了什么)。我希望Dask的结果与Pandas的结果一致。
这个玩具程序应该按原样运行以演示:
import dask
import dask.dataframe as ddf
import pandas as pd
# This creates a toy pd.DataFrame
def get(ii: int) -> pd.DataFrame:
    """Return a two-row DataFrame with a single column ``'a'``.

    The column holds ``2 * ii + 1`` and ``2 * ii + 2``, so consecutive
    values of ``ii`` yield consecutive, non-overlapping pairs of numbers
    (``get(0)`` -> 1, 2; ``get(1)`` -> 3, 4).
    """
    x = 2 * ii
    return pd.DataFrame.from_dict({'a': [x + 1, x + 2]})
if __name__ == '__main__':
    # Pandas reference: axis=1 places the two frames side by side,
    # giving one row per index value and two columns named 'a'.
    print('Using Pandas')
    df1 = get(0)
    df2 = get(1)
    pandas_df = pd.concat([df1, df2], axis=1)
    print(pandas_df)

    print('\n\nUsing Dask')
    # Build one lazy (delayed) call to get() per input value.
    output = [dask.delayed(get)(ii) for ii in range(2)]
    # from_delayed combines the delayed frames into a single dask
    # DataFrame; as the printed result shows, their rows end up stacked
    # one after another rather than placed side by side.
    temp = ddf.from_delayed(output)
    # Only one frame is passed to concat here, so axis=1 has nothing to
    # join column-wise and the row-stacked layout is preserved.
    temp2 = ddf.concat([temp], axis=1)
    dask_df = temp2.compute()
    print(dask_df)
输出:
Using Pandas (this is what I want)
a a
0 1 3
1 2 4
Using Dask (oops what happened here?)
a
0 1
1 2
0 3
1 4
import dask.dataframe as ddf
from dask import delayed
import pandas as pd
def get(ii: int) -> pd.DataFrame:
    """Build a toy two-row DataFrame whose only column is ``'a'``.

    The values are ``2 * ii + 1`` and ``2 * ii + 2``; e.g. ``get(0)``
    holds 1 and 2, ``get(1)`` holds 3 and 4.
    """
    base = 2 * ii
    values = [base + 1, base + 2]
    return pd.DataFrame.from_dict({'a': values})
@delayed
def make_daskdf(*num):
    """Concatenate ``get(i)`` for every ``i`` in ``num`` column-wise.

    Because of the ``delayed`` decorator, calling this function does not
    run it immediately; the caller invokes ``.compute()`` on the result
    to obtain the actual pandas DataFrame.

    Parameters
    ----------
    *num
        Values forwarded one by one to ``get``; each contributes one
        column named ``'a'`` to the result.

    Returns
    -------
    pandas.DataFrame
        The frames joined side by side (``axis=1``), in argument order.
    """
    # The column-wise join is done with plain pandas inside the task.
    return pd.concat([get(i) for i in num], axis=1)
# make_daskdf is wrapped with dask's delayed, so the call only builds a lazy
# task; .compute() runs it and returns the concatenated pandas DataFrame.
dask_df = make_daskdf(0, 1, 2, 3).compute()
输出:
dask_df
Out[38]:
a a a a
0 1 3 5 7
1 2 4 6 8