我有一个这样的数据帧*EDITED
StartDate EndDate Company Location
2019-01-15 2019-01-31 1.0 121.0
2019-02-01 2020-03-10 1.0 136.0
2006-10-02 2020-03-10 2.0 136.0
2003-07-31 2020-03-10 2.0 321.0
2010-11-03 2020-03-10 3.0 322.0
2013-02-01 2017-02-07 4.0 375.0
2017-02-08 2019-01-14 4.0 375.0
2019-01-15 2019-04-29 4.0 375.0
2019-04-30 2020-03-10 4.0 375.0
如本链接所述:Pandas:将日期范围解压缩为单个日期我想把它解压到只有一个字段,那就是日期。我一步一步地找到了解决方案。然而,当我尝试用重采样进行分组时,我会得到以下错误:ValueError: cannot reindex a non-unique index with a method or limit
发生这种情况的原因是什么?
更清楚地说,这是我的代码(原始数据帧的索引只是正常的索引1、2、3…
df=read_parquet('company_location.parquet')
df=df[['COMPANY','STARTDATE','ENDDATE','LOCATION']]
df['STARTDATE']=pd.to_datetime(df['STARTDATE'])
df['ENDDATE']=pd.to_datetime(df['ENDDATE'])
df=df.dropna(axis=0,how='any')
df['rows']=range(len(df))
starts=df[['COMPANY','STARTDATE','LOCATION','rows']].rename(columns={'STARTDATE':'DATE'})
ends=df[['COMPANY','ENDDATE','LOCATION','rows']].rename(columns={'ENDDATE':'DATE'})
df_decomp=pd.concat([starts,ends])
df_decomp=df_decomp.set_index('rows', append=True)
df_decomp.sort_index()
到这里之前一切都很好。
然后当我写这行的时候,出现了一个错误:
df_decomp=df_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))
错误是:(这是Jupyter笔记本(
ValueError Traceback (most recent call last)
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
688 try:
--> 689 result = self._python_apply_general(f)
690 except Exception:
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
706 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707 self.axis)
708
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
189 group_axes = _get_axes(group)
--> 190 res = f(group)
191 if not _is_indexed_like(res, group_axes):
<ipython-input-29-e5d0ce53cd1c> in <lambda>(x)
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in fillna(self, method, limit)
759 """
--> 760 return self._upsample(method, limit=limit)
761
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in _upsample(self, method, limit, fill_value)
1072 result = obj.reindex(res_index, method=method,
-> 1073 limit=limit, fill_value=fill_value)
1074
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
196 def wrapper(*args, **kwargs):
--> 197 return func(*args, **kwargs)
198
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
3808 kwargs.pop('labels', None)
-> 3809 return super(DataFrame, self).reindex(**kwargs)
3810
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4355 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 4356 fill_value, copy).__finalize__(self)
4357
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3740 frame = frame._reindex_index(index, method, copy, level,
-> 3741 fill_value, limit, tolerance)
3742
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
3748 level=level, limit=limit,
-> 3749 tolerance=tolerance)
3750 return self._reindex_with_indexers({0: [new_index, indexer]},
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in reindex(self, target, method, level, limit, tolerance)
3137 if method is not None or limit is not None:
-> 3138 raise ValueError("cannot reindex a non-unique index "
3139 "with a method or limit")
ValueError: cannot reindex a non-unique index with a method or limit
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-29-e5d0ce53cd1c> in <module>()
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
699
700 with _group_selection_context(self):
--> 701 return self._python_apply_general(f)
702
703 return result
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
705 def _python_apply_general(self, f):
706 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707 self.axis)
708
709 return self._wrap_applied_output(
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
188 # group might be modified
189 group_axes = _get_axes(group)
--> 190 res = f(group)
191 if not _is_indexed_like(res, group_axes):
192 mutated = True
<ipython-input-29-e5d0ce53cd1c> in <lambda>(x)
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in fillna(self, method, limit)
758 2018-01-01 02:00:00 6.0 5
759 """
--> 760 return self._upsample(method, limit=limit)
761
762 @Appender(_shared_docs['interpolate'] % _shared_docs_kwargs)
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in _upsample(self, method, limit, fill_value)
1071 else:
1072 result = obj.reindex(res_index, method=method,
-> 1073 limit=limit, fill_value=fill_value)
1074
1075 result = self._apply_loffset(result)
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
195 @wraps(func)
196 def wrapper(*args, **kwargs):
--> 197 return func(*args, **kwargs)
198
199 if not PY2:
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
3807 kwargs.pop('axis', None)
3808 kwargs.pop('labels', None)
-> 3809 return super(DataFrame, self).reindex(**kwargs)
3810
3811 @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4354 # perform the reindex on the axes
4355 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 4356 fill_value, copy).__finalize__(self)
4357
4358 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3739 if index is not None:
3740 frame = frame._reindex_index(index, method, copy, level,
-> 3741 fill_value, limit, tolerance)
3742
3743 return frame
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
3747 new_index, indexer = self.index.reindex(new_index, method=method,
3748 level=level, limit=limit,
-> 3749 tolerance=tolerance)
3750 return self._reindex_with_indexers({0: [new_index, indexer]},
3751 copy=copy, fill_value=fill_value,
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in reindex(self, target, method, level, limit, tolerance)
3136 else:
3137 if method is not None or limit is not None:
-> 3138 raise ValueError("cannot reindex a non-unique index "
3139 "with a method or limit")
3140 indexer, missing = self.get_indexer_non_unique(target)
ValueError: cannot reindex a non-unique index with a method or limit
我能够遵循Pandas:将日期范围解压缩到各个日期,并且我在您的数据集上没有得到任何错误。请参阅答案中的以下代码
import pandas as pd
df = pd.DataFrame([['2019-01-15','2019-01-31','A',121.0],
['2019-02-01','2020-03-10','A',136.0],
['2006-10-02','2020-03-10','B',136.0],
['2003-07-31','2020-03-10','B',321.0],
['2010-11-03','2020-03-10','C',322.0],
['2013-02-01','2017-02-07','D',375.0],
['2017-02-08','2019-01-14','D',375.0],
['2019-01-15','2019-04-29','D',375.0],
['2019-04-30','2020-03-10','D',375.0]],
columns=['StartDate','EndDate','Company','Location'])
df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
df.set_index('Company', inplace=True)
df['row'] = range(len(df))
print(df)
starts = df[['StartDate', 'Location', 'row']].rename(columns={'StartDate': 'Date'})
ends = df[['EndDate', 'Location', 'row']].rename(columns={'EndDate':'Date'})
df_decomp = pd.concat([starts, ends])
df_decomp = df_decomp.set_index('row', append=True)
df_decomp.sort_index()
print(df_decomp)
df_decomp = df_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('Date').resample('D').fillna(method='pad'))
df_decomp = df_decomp.reset_index(level=1, drop=True)
print(df_decomp.loc['D'])
StartDate EndDate Company Location
0 1/15/2019 1/31/2019 A 121
1 2/1/2019 3/10/2020 A 136
2 10/2/2006 3/10/2020 B 136
3 7/31/2003 3/10/2020 B 321
4 11/3/2010 3/10/2020 C 322
5 2/7/2017 2/7/2017 D 375
6 2/8/2017 1/14/2019 D 375
7 1/15/2019 4/29/2019 D 375
8 4/30/2019 3/10/2020 D 375
不确定出了什么问题,但复制上面的并运行下面的:
import pandas as pd
df = pd.read_clipboard()
然后,除了你链接的帖子外,还放两行代码:
df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
运行以下程序,它应该可以工作:
df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
df['row'] = range(len(df))
starts = df[['StartDate', 'Location', 'Company', 'row']].rename(columns={'StartDate': 'date'})
ends = df[['EndDate', 'Location', 'Company', 'row']].rename(columns={'EndDate':'date'})
df_decomp = pd.concat([starts, ends]).drop_duplicates()
df_decomp = df_decomp.set_index('row', append=True)
df_decomp.sort_index()
df_decomp = df_decomp.groupby(level=[0,1]).apply(lambda x:
x.set_index('date').resample('D').fillna(method='pad'))
df_decomp = df_decomp.reset_index(level=1, drop=True)
df_decomp