How can I make this code consume less RAM?



I have these two functions, and when I run them my kernel dies very quickly. What can I do to prevent that? It happens after appending around 10 files to the dataframe. Unfortunately the json files are very large (150 MB each, and there are a few dozen of them), and I don't know how to join them into one.

import os
import pandas as pd
import json

def filtering_nodes(df):
    id_list = df.index.tolist()
    print("Dropping rows without 4 nodes and 3 members...")
    for x in id_list:
        if len(df['Nodes'][x]) != 4 and len(df['Members'][x]) != 3:
            df = df.drop(x)
    print("Converting to csv...")
    df.to_csv("whole_df.csv", sep='\t')
    return df

def merge_JsonFiles(filename):
    cnt = 0
    df_all = None
    data_all = None

    for f1 in filename:
        print("Appending file: ", f1)
        with open('../../data/' + f1, 'r') as infile:
            data_all = json.loads(infile.read())
        if cnt == 0:
            df_all = pd.json_normalize(data_all, record_path=['List2D'], max_level=2, sep="-")
        else:
            df_all = df_all.append(pd.json_normalize(data_all, record_path=['List2D'], max_level=2, sep="-"), ignore_index=True)
        cnt += 1

    return df_all
files = os.listdir('../../data')
df_all_test = merge_JsonFiles(files)
df_all_test_drop = filtering_nodes(df_all_test)

EDIT: Thanks to @jlandercy's answer, I made this:

import pathlib

def merging_to_csv():
    for path in pathlib.Path("../../data/loads_data/Dane/hilti/").glob("*.json"):
        # Open source files one by one:
        with path.open() as handler:
            df = pd.json_normalize(json.load(handler), record_path=['List2D'])
        # Identify rows to drop (boolean indexing):
        q = (df["Nodes"] != 4) & (df["Members"] != 3)
        # Inplace drop (no extra copy in RAM):
        df.drop(q, inplace=True)
        # Append data to disk instead of RAM:
        df.to_csv("output.csv", mode="a", header=False)

merging_to_csv()

and I get this kind of error:

KeyError                                  Traceback (most recent call last)
<ipython-input-55-cf18265ca50e> in <module>
----> 1 merging_to_csv()
<ipython-input-54-698c67461b34> in merging_to_csv()
51         q = (df["Nodes"] != 4) & (df["Members"] != 3)
52         # Inplace drop (no extra copy in RAM):
---> 53         df.drop(q, inplace=True)
54         # Append data to disk instead of RAM:
55         df.to_csv("output.csv", mode="a", header=False)
/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309                     stacklevel=stacklevel,
310                 )
--> 311             return func(*args, **kwargs)
312 
313         return wrapper
/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4906             level=level,
4907             inplace=inplace,
-> 4908             errors=errors,
4909         )
4910 
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4148         for axis, labels in axes.items():
4149             if labels is not None:
-> 4150                 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4151 
4152         if inplace:
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
4183                 new_axis = axis.drop(labels, level=level, errors=errors)
4184             else:
-> 4185                 new_axis = axis.drop(labels, errors=errors)
4186             result = self.reindex(**{axis_name: new_axis})
4187 
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
6016         if mask.any():
6017             if errors != "ignore":
-> 6018                 raise KeyError(f"{labels[mask]} not found in axis")
6019             indexer = indexer[~mask]
6020         return self.delete(indexer)
KeyError: '[ True  True  True  True  True  True  True  True  True  True  True  True\n  True  True  True  True  True  True  True  True  True  True  True  True\n  True  True  True  True  True  True  True  True  True  True  True  True\n  True  True  True  True  True  True  True  True  True  True  True  True\n  True  True  True  True  True  True  True  True  True  True  True  True\n  True  True  True  True  True  True  True  True  True  True  True  True\n  True  True  True  True  True  True  True  True  True  True  True  True\n  True  True  True  True  True  True  True  True  True  True  True  True\n  True] not found in axis'

What is wrong? I have uploaded two minimal json files here: https://drive.google.com/drive/folders/1xlC-kK6NLGr0isdy1Ln2tzGmel45GtPC?usp=sharing

Your original approach suffers from multiple issues:

  • multiple copies of the dataframe created by df = df.drop(...);
  • all of the data kept in RAM because of append;
  • no loop is needed to filter rows, use boolean indexing instead.

Here is a baseline snippet that solves the problem, based on the data samples you provided:

import json
import pathlib
import pandas as pd

# Iterate over the source files:
for path in pathlib.Path(".").glob("result*.json"):
    # Open source files one by one:
    with path.open() as handler:
        # Normalize the JSON model:
        df = pd.json_normalize(json.load(handler), record_path=['List2D'], max_level=2, sep="-")
    # Apply len to the list fields to identify rows to drop (boolean indexing):
    q = (df["Nodes"].apply(len) != 4) & (df["Members"].apply(len) != 3)
    # Filter and append the rows to disk instead of RAM:
    df.loc[~q, :].to_csv("output.csv", mode="a", header=False)

It loads the files into RAM one at a time and appends the filtered rows to disk instead of keeping everything in memory. These fixes will drastically reduce memory usage, which should stay bounded by roughly twice the size of the largest JSON file.
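One caveat with this pattern: because every chunk is written with header=False, output.csv never gets a header row, and re-running the script keeps appending to the same file. Below is a minimal variant of the snippet above that writes the header only once; the first flag and the index=False argument are additions for illustration, not part of the original answer.

first = True
for path in pathlib.Path(".").glob("result*.json"):
    with path.open() as handler:
        df = pd.json_normalize(json.load(handler), record_path=['List2D'], max_level=2, sep="-")
    q = (df["Nodes"].apply(len) != 4) & (df["Members"].apply(len) != 3)
    # header=first emits the column names on the first append only:
    df.loc[~q, :].to_csv("output.csv", mode="a", header=first, index=False)
    first = False

Deleting any stale output.csv before the loop avoids mixing the results of separate runs.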

Latest update:
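The KeyError in the edited merging_to_csv comes from two changes relative to the snippet above. First, the .apply(len) calls were dropped, so the list-valued Nodes and Members columns are compared to integers directly and the mask comes out True everywhere. Second, DataFrame.drop expects index labels, not a boolean mask, so pandas looks for the labels True/False in the index and fails. A sketch of the repaired function, reusing the paths from your edit:

import json
import pathlib
import pandas as pd

def merging_to_csv():
    for path in pathlib.Path("../../data/loads_data/Dane/hilti/").glob("*.json"):
        with path.open() as handler:
            df = pd.json_normalize(json.load(handler), record_path=['List2D'], max_level=2, sep="-")
        # Compare the lengths of the list fields, not the lists themselves:
        q = (df["Nodes"].apply(len) != 4) & (df["Members"].apply(len) != 3)
        # Boolean indexing instead of drop(): drop() expects index labels,
        # not a boolean mask, which is what raised the KeyError:
        df.loc[~q, :].to_csv("output.csv", mode="a", header=False)

merging_to_csv()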