我正在尝试规范化以下JSON:
{
"href": "url1",
"items": [
{
"count": 6,
"items": [
{
"href": "url2",
"count": 2,
"items": [
{
"href": "url3",
"description": {
"year": 2017,
"make": "ACURA",
"model": "ILX",
"trim": "4D SEDAN ACURAWATCH PLUS",
"subSeries": "ACURAWATCH"
},
"adjustedPricing": {
"wholesale": {
"above": 18050,
"average": 15650,
"below": 13250
},
"adjustedBy": {
"Odometer": "30352"
}
},
"wholesale": {
"above": 17600,
"average": 15200,
"below": 12800
},
"averageOdometer": 36939,
"odometerUnits": "miles",
"averageGrade": 43,
"currency": "USD",
"sampleSize": "6"
},
{
"href": "url4",
"description": {
"year": 2017,
"make": "ACURA",
"model": "ILX",
"trim": "4D SEDAN",
"subSeries": "NONE"
},
"adjustedPricing": {
"wholesale": {
"above": 17950,
"average": 16600,
"below": 15250
},
"adjustedBy": {
"Odometer": "30352"
}
},
"wholesale": {
"above": 17900,
"average": 16550,
"below": 15200
},
"averageOdometer": 31235,
"odometerUnits": "miles",
"averageGrade": 37,
"currency": "USD",
"sampleSize": "47"
}
]
},
{
"href": "url5",
"count": 2,
"items": [
{
"href": "url6",
"description": {
"year": 2017,
"make": "ACURA",
"model": "ILX",
"trim": "4D SEDAN PREMIUM",
"subSeries": "PREMIUM"
},
"adjustedPricing": {
"wholesale": {
"above": 16200,
"average": 14700,
"below": 13200
},
"adjustedBy": {
"Odometer": "53824"
}
},
"wholesale": {
"above": 18500,
"average": 16950,
"below": 15450
},
"averageOdometer": 30090,
"odometerUnits": "miles",
"averageGrade": 36,
"currency": "USD",
"sampleSize": "15"
},
{
"href": "url7",
"description": {
"year": 2017,
"make": "ACURA",
"model": "ILX",
"trim": "4D SEDAN TECH PKG",
"subSeries": "TECH PKG"
},
"adjustedPricing": {
"wholesale": {
"above": 17850,
"average": 15950,
"below": 14050
},
"adjustedBy": {
"Odometer": "53824"
}
},
"wholesale": {
"above": 19800,
"average": 17850,
"below": 15950
},
"averageOdometer": 27247,
"odometerUnits": "miles",
"averageGrade": 37,
"currency": "USD",
"sampleSize": "21"
}
]
},
{
"href": "url8",
"count": 1,
"items": [
{
"href": "url9",
"description": {
"year": 2017,
"make": "ACURA",
"model": "MDX AWD",
"trim": "4D SUV 3.5L",
"subSeries": "NONE"
},
"adjustedPricing": {
"wholesale": {
"above": 28900,
"average": 27400,
"below": 25900
},
"adjustedBy": {
"Odometer": "23753"
}
},
"wholesale": {
"above": 28000,
"average": 26500,
"below": 25000
},
"averageOdometer": 34427,
"odometerUnits": "miles",
"averageGrade": 36,
"currency": "USD",
"sampleSize": "79",
"bestMatch": true
}
]
},
{
"href": "url10",
"count": 1,
"items": [
{
"href": "url11",
"description": {
"year": 2017,
"make": "ACURA",
"model": "MDX AWD",
"trim": "4D SUV 3.5L",
"subSeries": "NONE"
},
"adjustedPricing": {
"wholesale": {
"above": 27200,
"average": 25700,
"below": 24200
},
"adjustedBy": {
"Odometer": "43735"
}
},
"wholesale": {
"above": 28000,
"average": 26500,
"below": 25000
},
"averageOdometer": 34427,
"odometerUnits": "miles",
"averageGrade": 36,
"currency": "USD",
"sampleSize": "79",
"bestMatch": true
}
]
},
{
"href": "url12",
"count": 1,
"items": [
{
"href": "url13",
"description": {
"year": 2017,
"make": "ACURA",
"model": "MDX AWD",
"trim": "4D SUV 3.5L",
"subSeries": "NONE"
},
"adjustedPricing": {
"wholesale": {
"above": 27700,
"average": 26200,
"below": 24700
},
"adjustedBy": {
"Odometer": "38097"
}
},
"wholesale": {
"above": 28000,
"average": 26500,
"below": 25000
},
"averageOdometer": 34427,
"odometerUnits": "miles",
"averageGrade": 36,
"currency": "USD",
"sampleSize": "79",
"bestMatch": true
}
]
},
{
"href": "url14",
"count": 1,
"items": [
{
"href": "url15",
"description": {
"year": 2017,
"make": "ACURA",
"model": "MDX AWD",
"trim": "4D SUV 3.5L",
"subSeries": "NONE"
},
"adjustedPricing": {
"wholesale": {
"above": 28100,
"average": 26500,
"below": 25000
},
"adjustedBy": {
"Odometer": "33627"
}
},
"wholesale": {
"above": 28000,
"average": 26500,
"below": 25000
},
"averageOdometer": 34427,
"odometerUnits": "miles",
"averageGrade": 36,
"currency": "USD",
"sampleSize": "79",
"bestMatch": true
}
]
}
]
}
],
"errors": [
{
"count": 0,
"errors": []
}
]
}
我尝试过使用Pandas的json_normalize函数,该函数返回1行3列(href,items,errors(。我还尝试将其插入到Normalize中的每一个解决方案中,该解决方案是Panda中嵌套很深的json,但它最终返回了一个包含数百列的数据帧。有人知道如何正确地压平这个JSON,以便将其输入SQL吗?
对于这样的递归结构,您可以使用递归来压平它。
js
是您提供的示例JSON(必须将true
更改为True
(- 如果存在键项,则
pop()
会将其添加到正在生成的list
中。对列表中的所有元素执行递归调用。将其附加到列表中,不使用项键 - 如果键项不存在-一直重复出现,那么只需添加到
list
- 为了提高透明度,我在输出中包含了递归深度和指示器,如果展开的项是否有子项
def flat(js, l, level=0):
i = {}
if "items" in js.keys():
i = js.pop("items")
for js2 in i:
flat(js2, l, level+1)
l.append({**{"level":level, "haschild":True}, **js})
else:
l.append({**{"level":level, "haschild":False}, **js})
return l
l = []
l =flat(js, l)
df = pd.json_normalize(l)
print(df.iloc[:,:10].to_string(index=False))
样本输出
level haschild href averageOdometer odometerUnits averageGrade currency sampleSize description.year description.make
3 False url3 36939.0 miles 43.0 USD 6 2017.0 ACURA
3 False url4 31235.0 miles 37.0 USD 47 2017.0 ACURA
2 True url2 NaN NaN NaN NaN NaN NaN NaN
3 False url6 30090.0 miles 36.0 USD 15 2017.0 ACURA
3 False url7 27247.0 miles 37.0 USD 21 2017.0 ACURA
2 True url5 NaN NaN NaN NaN NaN NaN NaN
3 False url9 34427.0 miles 36.0 USD 79 2017.0 ACURA
2 True url8 NaN NaN NaN NaN NaN NaN NaN
3 False url11 34427.0 miles 36.0 USD 79 2017.0 ACURA
2 True url10 NaN NaN NaN NaN NaN NaN NaN
3 False url13 34427.0 miles 36.0 USD 79 2017.0 ACURA
2 True url12 NaN NaN NaN NaN NaN NaN NaN
3 False url15 34427.0 miles 36.0 USD 79 2017.0 ACURA
2 True url14 NaN NaN NaN NaN NaN NaN NaN
1 True NaN NaN NaN NaN NaN NaN NaN NaN
0 True url1 NaN NaN NaN NaN NaN NaN NaN