#这是我的代码
import pandas as pd
import bson
# Path of the file that is expected to contain binary BSON documents.
FILE = "users_(1).bson"

# Read the whole file as raw bytes and decode every BSON document in it.
# NOTE: bson.decode_all raises bson.errors.InvalidBSON when the bytes are not
# valid BSON (for example, when the file is actually a text dump of the
# documents rather than the binary encoding).
with open(FILE, 'rb') as f:
    data = bson.decode_all(f.read())

# One row per decoded document; nested documents stay as dict-valued cells.
main_df = pd.DataFrame(data)
# Summary statistics; the result is only displayed in an interactive session.
main_df.describe()
#This is my .bson file
[{'_id': ObjectId('999f24f260f653401b'),
'isV2': False,
'isBeingMigratedToV2': False,
'firstName': 'Jezz',
'lastName': 'Bezos',
'subscription': {'_id': ObjectId('999f24f260f653401b'),
'chargebeeId': 'AzZdd6T847kHQ',
'currencyCode': 'EUR',
'customerId': 'AzZdd6T847kHQ',
'nextBillingAt': datetime.datetime(2022, 7, 7, 10, 14, 6),
'numberOfMonthsPaid': 1,
'planId': 'booster-v3-eur',
'startedAt': datetime.datetime(2022, 6, 7, 10, 14, 6),
'addons': [],
'campaign': None,
'maskedCardNumber': '************1234'},
'email': 'jeffbezos@gmail.com',
'groupName': None,
'username': 'jeffbezy',
'country': 'DE'},
{'_id': ObjectId('999f242660f653401b'),
'isV2': False,
'isBeingMigratedToV2': False,
'firstName': 'Caterina',
'lastName': 'Fake',
'subscription': {'_id': ObjectId('999f242660f653401b'),
'chargebeeId': '16CGLYT846t99',
'currencyCode': 'GBP',
'customerId': '16CGLYT846t99',
'nextBillingAt': datetime.datetime(2022, 7, 7, 10, 10, 41),
'numberOfMonthsPaid': 1,
'planId': 'personal-v3-gbp',
'startedAt': datetime.datetime(2022, 6, 7, 10, 10, 41),
'addons': [],
'campaign': None,
'maskedCardNumber': '************4311'},
'email': 'caty.fake@gmail.com',
'groupName': None,
'username': 'cfake',
'country': 'GB'}]
我得到以下错误:
'bson.errors.InvalidBSON: objsize too large'
这与日期时间(datetime)有关吗?上面是 .bson 文件的结构。我已经在这个问题上花了好几个小时,仍然找不出错误所在。我知道如何使用 json,也尝试过将其转换为 json,但没有成功。如有任何建议,我将不胜感激。
如果这里的主要目标是将数据读取到 pandas DataFrame 中,您确实可以将数据格式化为 json,然后使用 bson.json_util.loads:
import pandas as pd
from bson.json_util import loads
# Read the dump as plain text.
# NOTE: `filepath` is not defined in this snippet; set it to the path of the
# text dump before running.
with open(filepath, 'r') as f:
    data = f.read()

# Textual substitutions that turn the Python-repr style dump into JSON text.
# They are applied one after another in insertion order, and later entries act
# on the output of earlier ones, so the order below must be preserved.
# After all substitutions, ObjectId(...) and datetime.datetime(...) calls end
# up as double-quoted strings, and None becomes the string "None".
mapper = {
    "'": '"',  # using double quotes
    'False': 'false',
    'None': '"None"',  # double quotes around None
    # modifying the ObjectIds and timestamps
    '("': '(',  # drop the quote right after an opening parenthesis
    '")': ')',  # drop the quote right before a closing parenthesis
    ')': ')"',  # close the string that will wrap the whole call
    'ObjectId': '"ObjectId',  # open the string in front of the call
    'datetime.datetime': '"datetime.datetime',
}
for k, v in mapper.items():
    data = data.replace(k, v)

# Parse the resulting JSON text and build one DataFrame row per document.
data = loads(data)
df = pd.DataFrame(data)