Python:具有开始日期和结束日期的数据帧,解压缩为1个日期字段



我有一个这样的数据帧*EDITED

StartDate EndDate Company Location
2019-01-15  2019-01-31  1.0 121.0
2019-02-01  2020-03-10  1.0 136.0
2006-10-02  2020-03-10  2.0 136.0
2003-07-31  2020-03-10  2.0 321.0
2010-11-03  2020-03-10  3.0 322.0
2013-02-01  2017-02-07  4.0 375.0
2017-02-08  2019-01-14  4.0 375.0
2019-01-15  2019-04-29  4.0 375.0
2019-04-30  2020-03-10  4.0 375.0

如本链接所述:Pandas:将日期范围解压缩为单个日期我想把它解压到只有一个字段,那就是日期。我一步一步地找到了解决方案。然而,当我尝试用重采样进行分组时,我会得到以下错误:ValueError: cannot reindex a non-unique index with a method or limit发生这种情况的原因是什么?

更清楚地说,这是我的代码(原始数据帧的索引只是正常的索引1、2、3…

df=read_parquet('company_location.parquet')
df=df[['COMPANY','STARTDATE','ENDDATE','LOCATION']]
df['STARTDATE']=pd.to_datetime(df['STARTDATE'])
df['ENDDATE']=pd.to_datetime(df['ENDDATE'])
df=df.dropna(axis=0,how='any')
df['rows']=range(len(df))
starts=df[['COMPANY','STARTDATE','LOCATION','rows']].rename(columns={'STARTDATE':'DATE'})
ends=df[['COMPANY','ENDDATE','LOCATION','rows']].rename(columns={'ENDDATE':'DATE'})
df_decomp=pd.concat([starts,ends])
df_decomp=df_decomp.set_index('rows', append=True)
df_decomp.sort_index()

到这里之前一切都很好。

然后当我写这行的时候,出现了一个错误:

df_decomp=df_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))

错误是:(这是Jupyter笔记本(

ValueError                                Traceback (most recent call last)
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
688             try:
--> 689                 result = self._python_apply_general(f)
690             except Exception:
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
706         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707                                                    self.axis)
708 
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
189             group_axes = _get_axes(group)
--> 190             res = f(group)
191             if not _is_indexed_like(res, group_axes):
<ipython-input-29-e5d0ce53cd1c> in <lambda>(x)
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in fillna(self, method, limit)
759         """
--> 760         return self._upsample(method, limit=limit)
761 
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in _upsample(self, method, limit, fill_value)
1072             result = obj.reindex(res_index, method=method,
-> 1073                                  limit=limit, fill_value=fill_value)
1074 
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
196         def wrapper(*args, **kwargs):
--> 197             return func(*args, **kwargs)
198 
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
3808         kwargs.pop('labels', None)
-> 3809         return super(DataFrame, self).reindex(**kwargs)
3810 
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4355         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 4356                                   fill_value, copy).__finalize__(self)
4357 
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3740             frame = frame._reindex_index(index, method, copy, level,
-> 3741                                          fill_value, limit, tolerance)
3742 
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
3748                                                 level=level, limit=limit,
-> 3749                                                 tolerance=tolerance)
3750         return self._reindex_with_indexers({0: [new_index, indexer]},
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in reindex(self, target, method, level, limit, tolerance)
3137                     if method is not None or limit is not None:
-> 3138                         raise ValueError("cannot reindex a non-unique index "
3139                                          "with a method or limit")
ValueError: cannot reindex a non-unique index with a method or limit
During handling of the above exception, another exception occurred:
ValueError                                Traceback (most recent call last)
<ipython-input-29-e5d0ce53cd1c> in <module>()
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
699 
700                 with _group_selection_context(self):
--> 701                     return self._python_apply_general(f)
702 
703         return result
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
705     def _python_apply_general(self, f):
706         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707                                                    self.axis)
708 
709         return self._wrap_applied_output(
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
188             # group might be modified
189             group_axes = _get_axes(group)
--> 190             res = f(group)
191             if not _is_indexed_like(res, group_axes):
192                 mutated = True
<ipython-input-29-e5d0ce53cd1c> in <lambda>(x)
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in fillna(self, method, limit)
758         2018-01-01 02:00:00  6.0  5
759         """
--> 760         return self._upsample(method, limit=limit)
761 
762     @Appender(_shared_docs['interpolate'] % _shared_docs_kwargs)
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in _upsample(self, method, limit, fill_value)
1071         else:
1072             result = obj.reindex(res_index, method=method,
-> 1073                                  limit=limit, fill_value=fill_value)
1074 
1075         result = self._apply_loffset(result)
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
195         @wraps(func)
196         def wrapper(*args, **kwargs):
--> 197             return func(*args, **kwargs)
198 
199         if not PY2:
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
3807         kwargs.pop('axis', None)
3808         kwargs.pop('labels', None)
-> 3809         return super(DataFrame, self).reindex(**kwargs)
3810 
3811     @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4354         # perform the reindex on the axes
4355         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 4356                                   fill_value, copy).__finalize__(self)
4357 
4358     def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3739         if index is not None:
3740             frame = frame._reindex_index(index, method, copy, level,
-> 3741                                          fill_value, limit, tolerance)
3742 
3743         return frame
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
3747         new_index, indexer = self.index.reindex(new_index, method=method,
3748                                                 level=level, limit=limit,
-> 3749                                                 tolerance=tolerance)
3750         return self._reindex_with_indexers({0: [new_index, indexer]},
3751                                            copy=copy, fill_value=fill_value,
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in reindex(self, target, method, level, limit, tolerance)
3136                 else:
3137                     if method is not None or limit is not None:
-> 3138                         raise ValueError("cannot reindex a non-unique index "
3139                                          "with a method or limit")
3140                     indexer, missing = self.get_indexer_non_unique(target)
ValueError: cannot reindex a non-unique index with a method or limit

我能够遵循Pandas:将日期范围解压缩到各个日期,并且我在您的数据集上没有得到任何错误。请参阅答案中的以下代码

import pandas as pd
df = pd.DataFrame([['2019-01-15','2019-01-31','A',121.0],
['2019-02-01','2020-03-10','A',136.0],
['2006-10-02','2020-03-10','B',136.0],
['2003-07-31','2020-03-10','B',321.0],
['2010-11-03','2020-03-10','C',322.0],
['2013-02-01','2017-02-07','D',375.0],
['2017-02-08','2019-01-14','D',375.0],
['2019-01-15','2019-04-29','D',375.0],
['2019-04-30','2020-03-10','D',375.0]],
columns=['StartDate','EndDate','Company','Location'])
df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
df.set_index('Company', inplace=True)
df['row'] = range(len(df))
print(df)
starts = df[['StartDate', 'Location', 'row']].rename(columns={'StartDate': 'Date'})
ends = df[['EndDate', 'Location', 'row']].rename(columns={'EndDate':'Date'})
df_decomp = pd.concat([starts, ends])
df_decomp = df_decomp.set_index('row', append=True)
df_decomp.sort_index()
print(df_decomp)
df_decomp = df_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('Date').resample('D').fillna(method='pad'))
df_decomp = df_decomp.reset_index(level=1, drop=True)
print(df_decomp.loc['D'])
StartDate   EndDate Company Location
0   1/15/2019   1/31/2019   A   121
1   2/1/2019    3/10/2020   A   136
2   10/2/2006   3/10/2020   B   136
3   7/31/2003   3/10/2020   B   321
4   11/3/2010   3/10/2020   C   322
5   2/7/2017    2/7/2017    D   375
6   2/8/2017    1/14/2019   D   375
7   1/15/2019   4/29/2019   D   375
8   4/30/2019   3/10/2020   D   375

不确定出了什么问题,但复制上面的并运行下面的:

import pandas as pd
df = pd.read_clipboard()

然后,除了你链接的帖子外,还放两行代码:

df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])

运行以下程序,它应该可以工作:

df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
df['row'] = range(len(df))
starts = df[['StartDate', 'Location', 'Company', 'row']].rename(columns={'StartDate': 'date'})
ends = df[['EndDate', 'Location', 'Company', 'row']].rename(columns={'EndDate':'date'})
df_decomp = pd.concat([starts, ends]).drop_duplicates()
df_decomp = df_decomp.set_index('row', append=True)
df_decomp.sort_index()
df_decomp = df_decomp.groupby(level=[0,1]).apply(lambda x: 
x.set_index('date').resample('D').fillna(method='pad'))
df_decomp = df_decomp.reset_index(level=1, drop=True)
df_decomp

最新更新