我是ipython和pandas 的新手
当我运行 pd.crosstab(df['A'], df['B']) 时,出现了内存错误(MemoryError)。
数据帧有10000000行。我认为数据的大小可能太大了。
我用df.values.nbytes+df.index.nbytes+df.columns.nbytes检查数据帧的大小
内存只有381 MB。我的服务器有16GB内存
如果我运行具有1000000行的数据帧,就没有问题。
我希望有人能帮忙。
错误的调试日志:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-6-199f99c3064f> in <module>()
99 df = df.applymap(lambda x: np.nan if str(x) == "N/A" or len(str(x).strip()) == 0 else x)
100
--> 101 summary_table(df)
<ipython-input-6-199f99c3064f> in summary_table(df)
78 dis_for_cont_vars(df)
79
---> 80 value_count(df)
81 #END summary_table
82
<ipython-input-6-199f99c3064f> in value_count(df)
63 def value_count(df):
64 print "===> Value counts\n"
---> 65 print pd.crosstab(df['A'], df['B'])
66 print "===>\n"
67
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/tools/pivot.pyc in crosstab(rows, cols, values, rownames, colnames, aggfunc, margins, dropna)
368 df['__dummy__'] = 0
369 table = df.pivot_table('__dummy__', rows=rownames, cols=colnames,
--> 370 aggfunc=len, margins=margins, dropna=dropna)
371 return table.fillna(0).astype(np.int64)
372 else:
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/tools/pivot.pyc in pivot_table(data, values, rows, cols, aggfunc, fill_value, margins, dropna)
108 to_unstack = [agged.index.names[i]
109 for i in range(len(rows), len(keys))]
--> 110 table = agged.unstack(to_unstack)
111
112 if not dropna:
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in unstack(self, level)
3211 """
3212 from pandas.core.reshape import unstack
-> 3213 return unstack(self, level)
3214
3215 #----------------------------------------------------------------------
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in unstack(obj, level)
416 def unstack(obj, level):
417 if isinstance(level, (tuple, list)):
--> 418 return _unstack_multiple(obj, level)
419
420 if isinstance(obj, DataFrame):
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in _unstack_multiple(data, clocs)
316 columns=data.columns)
317
--> 318 unstacked = dummy.unstack('__placeholder__')
319 if isinstance(unstacked, Series):
320 unstcols = unstacked.index
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in unstack(self, level)
3211 """
3212 from pandas.core.reshape import unstack
-> 3213 return unstack(self, level)
3214
3215 #----------------------------------------------------------------------
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in unstack(obj, level)
420 if isinstance(obj, DataFrame):
421 if isinstance(obj.index, MultiIndex):
--> 422 return _unstack_frame(obj, level)
423 else:
424 return obj.T.stack(dropna=False)
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in _unstack_frame(obj, level)
459 unstacker = _Unstacker(obj.values, obj.index, level=level,
460 value_columns=obj.columns)
--> 461 return unstacker.get_result()
462
463
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in get_result(self)
141 # TODO: find a better way than this masking business
142
--> 143 values, value_mask = self.get_new_values()
144 columns = self.get_new_columns()
145 index = self.get_new_index()
/home/deploy/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in get_new_values(self)
185 else:
186 dtype, fill_value = _maybe_promote(values.dtype)
--> 187 new_values = np.empty(result_shape, dtype=dtype)
188 new_values.fill(fill_value)
189
MemoryError:
这个问题很古老,但对于任何有同样问题的人来说:遇到内存错误(MemoryError)的原因是,得到的表的形状是 (m x n),其中 m 是 df.A 中唯一值的数目,n 是 df.B 中唯一值的数目,所以它可以变得很大。
为了避免这种情况,你可以尝试使用 sklearn 的 DictVectorizer,它可以做一些类似于交叉表的事情(例如,可以用来对分类特征进行虚拟编码,即 dummy encoding),但它会生成一个稀疏矩阵,更节省内存。
你也可以使用 scipy.sparse.coo_matrix,这是一种以 (i, j, v) 三元组形式存储的矩阵,其中 i 是 df['A'] 的索引,j 是 df['B'] 的索引,v 是对应的值(如果是交叉表,则为 1)。
最后这种实例化矩阵的方式可能正是适合你的方法:你必须在 dict 或另一个变量中自行记录标签与索引的对应关系,但它速度极快,内存开销也合理。
你必须小心,你不再有DataFrame,而是一个numpy数组。