在Python中规范化深度嵌套的JSON错误地产生了数百列



我正在尝试规范化以下JSON:

{
"href": "url1",
"items": [
{
"count": 6,
"items": [
{
"href": "url2",
"count": 2,
"items": [
{
"href": "url3",
"description": {
"year": 2017,
"make": "ACURA",
"model": "ILX",
"trim": "4D SEDAN ACURAWATCH PLUS",
"subSeries": "ACURAWATCH"
},
"adjustedPricing": {
"wholesale": {
"above": 18050,
"average": 15650,
"below": 13250
},
"adjustedBy": {
"Odometer": "30352"
}
},
"wholesale": {
"above": 17600,
"average": 15200,
"below": 12800
},
"averageOdometer": 36939,
"odometerUnits": "miles",
"averageGrade": 43,
"currency": "USD",
"sampleSize": "6"
},
{
"href": "url4",
"description": {
"year": 2017,
"make": "ACURA",
"model": "ILX",
"trim": "4D SEDAN",
"subSeries": "NONE"
},
"adjustedPricing": {
"wholesale": {
"above": 17950,
"average": 16600,
"below": 15250
},
"adjustedBy": {
"Odometer": "30352"
}
},
"wholesale": {
"above": 17900,
"average": 16550,
"below": 15200
},
"averageOdometer": 31235,
"odometerUnits": "miles",
"averageGrade": 37,
"currency": "USD",
"sampleSize": "47"
}
]
},
{
"href": "url5",
"count": 2,
"items": [
{
"href": "url6",
"description": {
"year": 2017,
"make": "ACURA",
"model": "ILX",
"trim": "4D SEDAN PREMIUM",
"subSeries": "PREMIUM"
},
"adjustedPricing": {
"wholesale": {
"above": 16200,
"average": 14700,
"below": 13200
},
"adjustedBy": {
"Odometer": "53824"
}
},
"wholesale": {
"above": 18500,
"average": 16950,
"below": 15450
},
"averageOdometer": 30090,
"odometerUnits": "miles",
"averageGrade": 36,
"currency": "USD",
"sampleSize": "15"
},
{
"href": "url7",
"description": {
"year": 2017,
"make": "ACURA",
"model": "ILX",
"trim": "4D SEDAN TECH PKG",
"subSeries": "TECH PKG"
},
"adjustedPricing": {
"wholesale": {
"above": 17850,
"average": 15950,
"below": 14050
},
"adjustedBy": {
"Odometer": "53824"
}
},
"wholesale": {
"above": 19800,
"average": 17850,
"below": 15950
},
"averageOdometer": 27247,
"odometerUnits": "miles",
"averageGrade": 37,
"currency": "USD",
"sampleSize": "21"
}
]
},
{
"href": "url8",
"count": 1,
"items": [
{
"href": "url9",
"description": {
"year": 2017,
"make": "ACURA",
"model": "MDX AWD",
"trim": "4D SUV 3.5L",
"subSeries": "NONE"
},
"adjustedPricing": {
"wholesale": {
"above": 28900,
"average": 27400,
"below": 25900
},
"adjustedBy": {
"Odometer": "23753"
}
},
"wholesale": {
"above": 28000,
"average": 26500,
"below": 25000
},
"averageOdometer": 34427,
"odometerUnits": "miles",
"averageGrade": 36,
"currency": "USD",
"sampleSize": "79",
"bestMatch": true
}
]
},
{
"href": "url10",
"count": 1,
"items": [
{
"href": "url11",
"description": {
"year": 2017,
"make": "ACURA",
"model": "MDX AWD",
"trim": "4D SUV 3.5L",
"subSeries": "NONE"
},
"adjustedPricing": {
"wholesale": {
"above": 27200,
"average": 25700,
"below": 24200
},
"adjustedBy": {
"Odometer": "43735"
}
},
"wholesale": {
"above": 28000,
"average": 26500,
"below": 25000
},
"averageOdometer": 34427,
"odometerUnits": "miles",
"averageGrade": 36,
"currency": "USD",
"sampleSize": "79",
"bestMatch": true
}
]
},
{
"href": "url12",
"count": 1,
"items": [
{
"href": "url13",
"description": {
"year": 2017,
"make": "ACURA",
"model": "MDX AWD",
"trim": "4D SUV 3.5L",
"subSeries": "NONE"
},
"adjustedPricing": {
"wholesale": {
"above": 27700,
"average": 26200,
"below": 24700
},
"adjustedBy": {
"Odometer": "38097"
}
},
"wholesale": {
"above": 28000,
"average": 26500,
"below": 25000
},
"averageOdometer": 34427,
"odometerUnits": "miles",
"averageGrade": 36,
"currency": "USD",
"sampleSize": "79",
"bestMatch": true
}
]
},
{
"href": "url14",
"count": 1,
"items": [
{
"href": "url15",
"description": {
"year": 2017,
"make": "ACURA",
"model": "MDX AWD",
"trim": "4D SUV 3.5L",
"subSeries": "NONE"
},
"adjustedPricing": {
"wholesale": {
"above": 28100,
"average": 26500,
"below": 25000
},
"adjustedBy": {
"Odometer": "33627"
}
},
"wholesale": {
"above": 28000,
"average": 26500,
"below": 25000
},
"averageOdometer": 34427,
"odometerUnits": "miles",
"averageGrade": 36,
"currency": "USD",
"sampleSize": "79",
"bestMatch": true
}
]
}
]
}
],
"errors": [
{
"count": 0,
"errors": []
}
]
}

我尝试过使用Pandas的json_normalize函数,该函数返回1行3列(href,items,errors(。我还尝试将其插入到Normalize中的每一个解决方案中,该解决方案是Panda中嵌套很深的json,但它最终返回了一个包含数百列的数据帧。有人知道如何正确地压平这个JSON,以便将其输入SQL吗?

对于这样的递归结构,您可以使用递归来压平它。

  1. js是您提供的示例JSON(必须将true更改为True(
  2. 如果存在键,则pop()会将其添加到正在生成的list中。对列表中的所有元素执行递归调用。将其附加到列表中,不使用
  3. 如果键不存在-一直重复出现,那么只需添加到list
  4. 为了提高透明度,我在输出中包含了递归深度和指示器,如果展开的项是否有子项
def flat(js, l, level=0):
i = {}
if "items" in js.keys():
i = js.pop("items")
for js2 in i:
flat(js2, l, level+1)
l.append({**{"level":level, "haschild":True}, **js})
else:
l.append({**{"level":level, "haschild":False}, **js})
return l
l = []
l =flat(js, l)
df = pd.json_normalize(l)
print(df.iloc[:,:10].to_string(index=False))

样本输出

level  haschild   href  averageOdometer odometerUnits  averageGrade currency sampleSize  description.year description.make
3     False   url3          36939.0         miles          43.0      USD          6            2017.0            ACURA
3     False   url4          31235.0         miles          37.0      USD         47            2017.0            ACURA
2      True   url2              NaN           NaN           NaN      NaN        NaN               NaN              NaN
3     False   url6          30090.0         miles          36.0      USD         15            2017.0            ACURA
3     False   url7          27247.0         miles          37.0      USD         21            2017.0            ACURA
2      True   url5              NaN           NaN           NaN      NaN        NaN               NaN              NaN
3     False   url9          34427.0         miles          36.0      USD         79            2017.0            ACURA
2      True   url8              NaN           NaN           NaN      NaN        NaN               NaN              NaN
3     False  url11          34427.0         miles          36.0      USD         79            2017.0            ACURA
2      True  url10              NaN           NaN           NaN      NaN        NaN               NaN              NaN
3     False  url13          34427.0         miles          36.0      USD         79            2017.0            ACURA
2      True  url12              NaN           NaN           NaN      NaN        NaN               NaN              NaN
3     False  url15          34427.0         miles          36.0      USD         79            2017.0            ACURA
2      True  url14              NaN           NaN           NaN      NaN        NaN               NaN              NaN
1      True    NaN              NaN           NaN           NaN      NaN        NaN               NaN              NaN
0      True   url1              NaN           NaN           NaN      NaN        NaN               NaN              NaN

最新更新