我的问题类似于这个帖子:《在 pandas 中从具有多个值的列创建虚拟变量(dummies)》。



import pandas as pd
# Sample data: each cell holds a comma-separated list of fruit names, including
# blank entries, trailing commas, and entries with a leading space.
df = pd.DataFrame(
    {
        'fruit': [
            'Banana, , Apple, Dragon Fruit,,,',
            'Kiwi,',
            'Lemon, Apple, Banana',
            ',',
        ],
    }
)


Apple  Banana Dragon Fruit    Banana  Kiwi    Lemon
0     1      1        0            1         1     0        0
1     0      0        0            0         0     1        0
2     0      1        1            0         0     0        1
3     0      0        0            0         0     0        0










# Name of the comma-separated column that is split and then one-hot encoded.
col_name_to_split = 'fruit'

# Build the sample frame in pandas and hand it to Dask as two partitions; the
# partition count is deliberately lower than the number of rows.
# NOTE(review): `dd` is presumably `dask.dataframe`; its import is not visible
# in this snippet - confirm it is imported before this point.
df = dd.from_pandas(
    pd.DataFrame(
        {
            'fruit': [
                'Banana, , Apple, Dragon Fruit,,,',
                'Kiwi,',
                'Lemon, Apple, Banana',
                ',',
            ],
        }
    ),
    npartitions=2,
)
def split_col(
    df: pd.DataFrame,
    col: str,
    *,
    anticipated_splits: int = 7,
) -> pd.DataFrame:
    """Split a comma-separated string column into one column per piece.

    The values of ``col`` are split on ``','`` and spread over new columns
    named ``f'{col}__{n}'`` (``n`` is the zero-based position of the piece).
    The original ``col`` is removed from the result.

    The function is meant to be mapped over partitions. A single partition
    only sees its own rows, so it may produce fewer split columns than other
    partitions; every column ``f'{col}__0'`` .. ``f'{col}__{anticipated_splits - 1}'``
    that is missing is therefore added and filled with NaN so that all
    partitions share at least that set of columns.

    Args:
        df: Frame containing the string column ``col``.
        col: Name of the column to split.
        anticipated_splits: Minimum number of split columns to emit. Columns
            beyond this count that the data actually produces are kept.

    Returns:
        A new frame without ``col`` and with the split columns joined on the
        index. The input frame is not modified.
    """
    pieces = df[col].str.split(',', expand=True)
    # Double underscore keeps these intermediate columns distinguishable from
    # other columns when they are dropped later.
    pieces.columns = [f'{col}__{position}' for position in pieces.columns]

    # Pad with all-NaN columns when this partition produced fewer pieces than
    # anticipated.
    for position in range(anticipated_splits):
        name = f'{col}__{position}'
        if name not in pieces.columns:
            pieces[name] = float('nan')

    return df.drop(columns=[col]).join(pieces)
# Run the splitter on every partition of the Dask frame.
df = df.map_partitions(split_col, col_name_to_split)

# Names of the intermediate columns created by the split step.
split_marker = f"{col_name_to_split}__"
split_cols = [name for name in df.columns if split_marker in name]

# Per the original author: categorizing triggers the queued computations so
# that Dask knows how many dummy columns to create.
df = df.categorize(split_cols)


def _false_to_nan(value):
    """Return NaN for falsy cells and the cell itself otherwise."""
    return value if value else float('nan')


tmp_dfs = {}
for split_name in split_cols:
    dummies = dd.get_dummies(df[split_name], prefix=col_name_to_split)
    # Falsy cells become NaN so that `.combine_first` doesn't overwrite
    # existing `True`s with incoming `False`s.
    # Caveat (original author): the one-hot columns render as `1.0` in some
    # columns and as `True` in others; the cause is unknown, but the values
    # are treated as equivalent.
    dummies = dummies.map(_false_to_nan)
    df = df.combine_first(dummies)
    tmp_dfs[split_name] = dummies

# The intermediate split columns are no longer needed once encoded.
df = df.drop(columns=split_cols)
fruit_ fruit_  fruit_ Apple fruit_ Banana fruit_ Dragon Fruit fruit_Banana  
0   True    True         True           NaN                True         True   
1   True     NaN          NaN           NaN                 NaN          NaN   
2    NaN     NaN         True          True                 NaN          NaN   
3   True     NaN          NaN           NaN                 NaN          NaN   
fruit_Kiwi fruit_Lemon  
0        NaN         NaN  
1       True         NaN  
2        NaN        True  
3        NaN         NaN  

