如何将嵌套的JSON解析为CSV



我有一个新项目,我从REST API获取JSON数据-我正在尝试将这些数据解析为csv管道分隔,以导入我们的遗留软件我似乎无法正确解析所有的值对——这是我第一次接触JSON,我尝试了很多东西,但每次都只得到一点正确的

我使用过Python,可以获得一些我需要的项目,但不能获得整个JSON树——它看起来是一个列表,其中也有一些字典和列表我知道我的代码是不完整的,只是在找人给我指正确的方向,在python中什么工具可以完成任务

import json
import csv
with open('tenants.json') as access_json:
read_content = json.load(access_json)

for rm_access in read_content:
rm_data = rm_access
print(rm_data)
contacts_data = rm_data['Contacts']
leases_data = rm_data['Leases']
udfs_data = rm_data['UserDefinedValues']
for contacts_access in contacts_data:
rm_contacts = contacts_access

更新:

import pandas as pd
with open('tenants.json') as access_json:
read_content = json.load(access_json)
for rm_access in read_content:
rm_data = rm_access
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 150)
TenantID = []
TenantDisplayID = []
Name = []
FirstName = []
LastName = []
WebMessage = []
Comment = []
RentDueDay = []
RentPeriod = []
FirstContact = []
PropertyID = []
PostingStartDate = []
CreateDate = []
CreateUserID = []
UpdateDate = []
UpdateUserID = []
Contacts = []
for rm_access in read_content:
rm_data = rm_access
TenantID.append(rm_data["TenantID"])
TenantDisplayID.append(rm_data["TenantDisplayID"])
Name.append(rm_data["Name"])
FirstName.append(rm_data["FirstName"])
LastName.append(rm_data["LastName"])
WebMessage.append(rm_data["WebMessage"])
Comment.append(rm_data["Comment"])
RentDueDay.append(rm_data["RentDueDay"])
RentPeriod.append(rm_data["RentPeriod"])
#    FirstContact.append(rm_data["FirstContact"])
PropertyID.append(rm_data["PropertyID"])
PostingStartDate.append(rm_data["PostingStartDate"])
CreateDate.append(rm_data["CreateDate"])
CreateUserID.append(rm_data["CreateUserID"])
UpdateUserID.append(rm_data["UpdateUserID"])
Contacts.append(rm_data["Contacts"])

df = pd.DataFrame({"TenantID":TenantID,"TenantDisplayID":TenantDisplayID, "Name"
: Name,"FirstName":FirstName, "LastName": LastName,"WebMessage": WebMessage,"Com
ment": Comment, "RentDueDay": RentDueDay, "RentPeriod": RentPeriod, "PropertyID"
: PropertyID, "PostingStartDate": PostingStartDate,"CreateDate": CreateDate, "Cr
eateUserID": CreateUserID,"UpdateUserID": UpdateUserID,"Contacts": Contacts})
print(df)

这是文件的样本

[
{
"TenantID": 115,
"TenantDisplayID": 115,
"Name": "Jane Doe",
"FirstName": "Jane",
"LastName": "Doe",
"WebMessage": "",
"Comment": "",
"RentDueDay": 1,
"RentPeriod": "Monthly",
"FirstContact": "2015-11-01T15:30:00",
"PropertyID": 17,
"PostingStartDate": "2010-10-01T00:00:00",
"CreateDate": "2014-04-16T13:35:37",
"CreateUserID": 1,
"UpdateDate": "2017-03-22T11:31:48",
"UpdateUserID": 1,
"Contacts": [
{
"ContactID": 128,
"FirstName": "Jane",
"LastName": "Doe",
"MiddleName": "",
"IsPrimary": true,
"DateOfBirth": "1975-02-27T00:00:00",
"FederalTaxID": "111-11-1111",
"Comment": "",
"Email": "jane.doe@mail.com",
"License": "ZZT4532",
"Vehicle": "BMW 3 Series",
"IsShowOnBill": true,
"Employer": "REW",
"ApplicantType": "Applicant",
"CreateDate": "2014-04-16T13:35:37",
"CreateUserID": 1,
"UpdateDate": "2017-03-22T11:31:48",
"AnnualIncome": 0.0,
"UpdateUserID": 1,
"ParentID": 115,
"ParentType": "Tenant",
"PhoneNumbers": [
{
"PhoneNumberID": 286,
"PhoneNumberTypeID": 2,
"PhoneNumber": "703-555-5610",
"Extension": "",
"StrippedPhoneNumber": "7035555610",
"IsPrimary": true,
"ParentID": 128,
"ParentType": "Contact"
}
]
}
],
"UserDefinedValues": [
{
"UserDefinedValueID": 1,
"UserDefinedFieldID": 4,
"ParentID": 115,
"Name": "Emerg Contact Name",
"Value": "Terry Harper",
"UpdateDate": "2016-01-22T15:41:53",
"FieldType": "Text",
"UpdateUserID": 2,
"CreateUserID": 2
},
{
"UserDefinedValueID": 174,
"UserDefinedFieldID": 5,
"ParentID": 115,
"Name": "Emerg Contact Phone",
"Value": "703-555-3568",
"UpdateDate": "2016-01-22T15:42:03",
"FieldType": "Text",
"UpdateUserID": 2,
"CreateUserID": 2
}
],
"Leases": [
{
"LeaseID": 115,
"TenantID": 115,
"UnitID": 181,
"PropertyID": 17,
"MoveInDate": "2010-10-01T00:00:00",
"SortOrder": 1,
"CreateDate": "2014-04-16T13:35:37",
"UpdateDate": "2017-03-22T11:31:48",
"CreateUserID": 1,
"UpdateUserID": 1
}
],
"Addresses": [
{
"AddressID": 286,
"AddressTypeID": 1,
"Address": "14393 Montgomery Road Lot #102rnCincinnati, OH 45122",
"Street": "14393 Montgomery Road Lot #102",
"City": "Cincinnati",
"State": "OH",
"PostalCode": "45122",
"IsPrimary": true,
"ParentID": 115,
"ParentType": "Tenant"
}
],
"OpenReceivables": [],
"Status": "Current"
},

并非所有租户都会拥有所有元素,这也是一个棘手的

我需要顶部有TenantID、TenantDisplayID等的数据我还需要来自Contacts、PhoneNumbers、Leases等值的数据每一行都应该是静态的,所以如果它没有特定的标记,那么我想要一个Null或None,这样它看起来就像TentantID|TenantDisplayID|FirstName…等,因此每行具有相同数量的字段

类似的东西应该可以工作:

import pandas as pd 
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 100000)
TenantID = []
TenantDisplayID = []
Name = []
FirstName = []
LastName = []
WebMessage = []
Comment = []
RentDueDay = []
RentPeriod = []
FirstContact = []
PropertyID = []
PostingStartDate = []
CreateDate = []
CreateUserID = []
UpdateDate = []
UpdateUserID = []
Contacts = []
for rm_access in read_content:
rm_data = rm_access
print(rm_data)
TenantID.append(rm_data["TenantID"])
TenantDisplayID.append(rm_data["TenantDisplayID"])
Name.append(rm_data["Name"])
FirstName.append(rm_data["FirstName"])
LastName.append(rm_data["LastName"])
WebMessage.append(rm_data["WebMessage"])
Comment.append(rm_data["Comment"])
RentDueDay.append(rm_data["RentDueDay"])
RentPeriod.append(rm_data["RentPeriod"])
FirstContact.append(rm_data["FirstContact"])
PropertyID.append(rm_data["PropertyID"])
PostingStartDate.append(rm_data["PostingStartDate"])
CreateDate.append(rm_data["CreateDate"])
CreateUserID.append(rm_data["CreateUserID"])
UpdateUserID.append(rm_data["UpdateUserID"])
Contacts.append(rm_data["Contacts"])

df = pd.DataFrame({"TenantID":TenantID,"TenantDisplayID":TenantDisplayID, "Name": Name,
"FirstName":FirstName, "LastName": LastName,"WebMessage": WebMessage,
"Comment": Comment, "RentDueDay": RentDueDay, "RentPeriod": RentPeriod,
"FirstContact": FirstContact, "PropertyID": PropertyID, "PostingStartDate": PostingStartDate,
"CreateDate": CreateDate, "CreateUserID": CreateUserID,"UpdateUserID": UpdateUserID,
"Contacts": Contacts})
print(df)

一般问题

这个任务(以及其他类似任务)的问题不仅仅是如何创建算法——我相信理论上你可以通过(不那么)大量的嵌套for循环来解决这个问题。问题是以一种不会让你头疼的方式来组织代码,也就是说,你可以很容易地修复错误,你可以编写单元测试,你可以从阅读中很容易地理解代码(从现在起的六个月内),并且你可以在需要时很容易地更改代码。我不知道有谁在把头绕在一个嵌套很深的结构上时不会犯错误。在一个嵌套严重的代码中寻找bug,因为它反映了数据的嵌套结构,这可能会非常令人沮丧。

快速(很可能是:最佳)解决方案

依赖为您的确切用例而制作的包,如

https://github.com/cwacek/python-jsonschema-objects

如果您有API模式的正式定义,您可以使用包来实现。例如,如果API具有Swagger模式定义,则不能使用swagger-py(https://github.com/digium/swagger-py)将JSON响应转换为Python对象。

原理解决方案:面向对象编程与递归

即使可能有一些库用于您的具体用例,我也想解释一下如何处理"那种"任务的原理:

针对这类问题组织代码的一个好方法是使用面向对象编程。通过使用递归的原理,可以更清楚地解决嵌套问题。这也使访问代码变得更容易,以防API响应的JSON模式因任何原因而更改(例如,API的更新)。在您的情况下,我建议您创建以下内容:

class JsonObject:
"""Parent Class for any Object that will be retrieved from the JSON
and potentially has nested JsonObjects inside.
This class takes care of parsing the json into python Objects and deals
with the recursion into the nested structures."""
primitives = []
json_objects = {
# For each class, this dict defines all the "embedded" classes which
# live directly "under" that class in the nested JSON. It will have the
# following structure:
# attribute_name : class
# In your case the JSON schema does not have any "single" objects
# in the nesting strcuture, but only lists of nested objects. I
# still , to demonstrate how you would do it in case, there would be
# single "embedded"
}
json_object_lists = {
# For each class, this dict defines all the "embedded" subclasses which
# are provided in a list "under" that class in the nested JSON.
# It will have the following structure:
# attribute_name : class
}
@classmethod
def from_dict(cls, d: dict) -> "JsonObject":
instance = cls()
for attribute in cls.primitives:
# Here we just parse all the primitives
instance.attribute = getattr(d, attribute, None)
for attribute, klass in cls.json_object_lists.items():
# Here we parse all lists of embedded JSON Objects
nested_objects = []
l = getattr(d, attribute, [])
for nested_dict in l:
nested_objects += klass.from_dict(nested_dict)
setattr(instance, attribute, nested_objects)
for attribute, klass in cls.json_objects.items():
# Here we parse all "single" embedded JSON Objects
setattr(
instance,
attribute,
klass.from_dict(getattr(d, attribute, None)
)
def to_csv(self) -> str:
pass

由于您没有解释如何从JSON创建csv,所以我没有实现该方法,而是将其留给您。也没有必要解释总体方法。

现在我们有了通用的Parent类,所有特定的都将从中继承,这样我们就可以将递归应用于我们的问题。现在我们只需要根据我们想要解析的JSON模式来定义这些具体的结构。我从你的样品中得到了以下内容,但你可以很容易地改变你需要的东西:

class Address(JsonObject):
primitives = [
"AddressID",
"AddressTypeID",
"Address",
"Street",
"City",
"State",
"PostalCode",
"IsPrimary",
"ParentID",
"ParentType",
]
json_objects = {}
json_object_lists = {}

class Lease(JsonObject):
primitives = [
"LeaseID",
"TenantID",
"UnitID",
"PropertyID",
"MoveInDate",
"SortOrder",
"CreateDate",
"UpdateDate",
"CreateUserID",
"UpdateUserID",
]
json_objects = {}
json_object_lists = {}

class UserDefinedValue(JsonObject):
primitives = [
"UserDefinedValueID",
"UserDefinedFieldID",
"ParentID",
"Name",
"Value",
"UpdateDate",
"FieldType",
"UpdateUserID",
"CreateUserID",
]
json_objects = {}
json_object_lists = {}

class PhoneNumber(JsonObject):
primitives = [
"PhoneNumberID",
"PhoneNumberTypeID",
"PhoneNumber",
"Extension",
"StrippedPhoneNumber",
"IsPrimary",
"ParentID",
"ParentType",
]
json_objects = {}
json_object_lists = {}
class Contact(JsonObject):
primitives = [
"ContactID",
"FirstName",
"LastName",
"MiddleName",
"IsPrimary",
"DateOfBirth",
"FederalTaxID",
"Comment",
"Email",
"License",
"Vehicle",
"IsShowOnBill",
"Employer",
"ApplicantType",
"CreateDate",
"CreateUserID",
"UpdateDate",
"AnnualIncome",
"UpdateUserID",
"ParentID",
"ParentType",
]
json_objects = {}
json_object_lists = {
"PhoneNumbers": PhoneNumber,
}

class Tenant(JsonObject):
primitives = [
"TenantID",
"TenantDisplayID",
"Name",
"FirstName",
"LastName",
"WebMessage",
"Comment",
"RentDueDay",
"RentPeriod",
"FirstContact",
"PropertyID",
"PostingStartDate",
"CreateDate",
"CreateUserID",
"UpdateDate",
"UpdateUserID",
"OpenReceivables",  # Maybe this is also a nested Object? Not clear from your sample.
"Status",
]
json_object_lists = {
"Contacts": Contact,
"UserDefinedValues": UserDefinedValue,
"Leases": Lease,
"Addresses": Address,
}
json_objects = {}

你可能会想象这种方法的"美妙"(至少:顺序),它在于以下内容:使用这种结构,我们可以在API的JSON响应中处理任何级别的嵌套,而不会引起额外的头痛-我们的代码不会加深其缩进级别,因为我们已经将讨厌的嵌套分离到JsonObjectfrom_json方法的递归定义中。这就是为什么现在识别错误或对代码进行更改要容易得多。

要最终将JSON解析为我们的对象,您需要执行以下操作:

import typing
import json

def tenants_from_json(json_string: str) -> typing.Iterable["Tenant"]:
tenants = [
Tenant.from_dict(tenant_dict)
for tenant_dict in json.loads(json_string)
]
return tenants

最后重要提示:这只是基本原则

我的代码示例只是一个非常简短的介绍,介绍了使用对象和递归来处理结构的压倒性(和讨厌)嵌套的想法。该代码有一些缺陷。例如,应该避免定义可变的类变量。当然,整个代码应该验证它从API获得的数据。您可能还想添加每个属性的类型,并在Python对象中正确地表示它(例如,您的示例有整数、日期时间和字符串)。

我真的只想在这里向你展示面向对象编程的原理。

我没有花时间测试我的代码。所以可能还有漏洞。再说一遍,我只是想证明这个原则。

最新更新