在python中创建json数据的可配置代码



我试图创建使用python将平面文件转换为json文件的可配置代码。我有一个多值的电子邮件专栏,将来还会有更多的多值专栏。所以需要开发这样的代码,代码应该适用于任何数据。

平面文件输入数据

source_id,fname,lname,email,dob,line1,line2,line3,city,state,country
7,priya,kannan,shanthapriya794@gmail.com,07-12-1994,123,456,67,mdu,tn,india
7,priya,kannan,shanthapriya7964@gmail.com,07-12-1994,123,456,67,mdu,tn,india

输出得到

[{
"source_id": 7,
"fname": "priya",
"lname": "kannan",
"date_of_birth": "07-12-1994",
"address": [{
"line1": 123,
"line2": 456,
"line3": 67,
"city": "mdu",
"state": "tn",
"country": "india"
}, {
"line1": 123,
"line2": 456,
"line3": 67,
"city": "mdu",
"state": "tn",
"country": "india"
}]
}]

预期输出

[{
"source_id": 7,
"fname": "priya",
"lname": "kannan",
"date_of_birth": "07-12-1994",
"email" : ["shanthapriya794@gmail.com","shanthapriya7964@gmail.com"],
"address": [{
"line1": 123,
"line2": 456,
"line3": 67,
"city": "mdu",
"state": "tn",
"country": "india"
}]
}]

代码尝试

file.py

import pandas as pd
import json
from configuration import config
def main():
path = config['path']['input_file_path']
reg_col = config['columns']['reg_fields']
multivalued_fields = config['columns']['multi_value']
multivalued_fields = list(multivalued_fields.split(","))
g_cols = list(reg_col.split(","))
df = pd.read_csv(path, sep=",", header=0)
cols = df.columns[~df.columns.isin(g_cols)]
g_cols = [ele for ele in g_cols if ele not in multivalued_fields]
i=0
while i < len(multivalued_fields):
j = multivalued_fields[i]
df2 = (df.sort_values(g_cols).set_index(g_cols).assign(j=df.groupby(g_cols)[j].agg(lambda x: tuple(pd.unique(x)))).reset_index())
i = i + 1

df3 = df2.drop_duplicates().groupby(g_cols)[cols].apply(lambda x: x.to_dict('records')).reset_index(name='address').to_dict('record')
df3 = pd.DataFrame(df3)
return df3
def writefile_toJson(df):
df.to_json('outputfiles/jsonstructure1.json', orient='records')
if __name__ == "__main__":
df = main()
writefile_toJson(df)

config.ini

[path]
input_file_path = Input_Files/flat_test_file.txt
[columns]
reg_fields = source_id,fname,lname,email,date_of_birth
multi_value = email

configuration.py

from configparser import ConfigParser
# Loading configuration details
file = "config.ini"
config = ConfigParser()
config.read(file)
import pandas as pd
import configparser, ast, json
from configuration import config
import ast
def main():
path = config['path']['input_file_path']
json_path = config['json_path']['input_json_path']
g_cols = config['columns']['reg_fields']
g_cols = list(g_cols.split(","))
multivalued_fields = config['columns']['multi_value']
multivalued_fields = list(multivalued_fields.split(","))
print(multivalued_fields)
df = pd.read_csv(path, sep=",", header=0)
with open(json_path, 'r') as f:
dict_val = json.load(f)
df.columns = df.columns.to_series().map(dict_val)
cols= df.columns[~df.columns.isin(g_cols)]
g_cols = [ele for ele in g_cols if ele not in multivalued_fields]
df1 = df.groupby(g_cols, as_index=False)[multivalued_fields].agg(lambda x: set(x)).to_dict('records')
print("df1",df1)
print("cols", cols)
print("gcols", g_cols)
df31 = pd.DataFrame(df1)
print(df31)
df2 = df.drop_duplicates().groupby(g_cols)[cols].apply(lambda x: x.to_dict('records')).reset_index(name='address').to_dict('record')
print(df2)
my_finallist=[]
for i in range(0,len(df2)):
my_finallist =[dict(s) for s in set(frozenset(d.items()) for d in df2[i]["address"])]
print("my_finallist :",list(my_finallist))
df2[i]["address"] =my_finallist
df3 = pd.DataFrame(df2)
df4= pd.merge(df3,df31)
print(df3)
return df4
def writefile_toJson(df):
df.to_json('outputfiles/jsonstructure1.json', orient='records')
if __name__ == "__main__":
df = main()
writefile_toJson(df)

最新更新