Python - 脚本从JSON中读取一个值,并在CSV中写入200次



我正在编写一个Python脚本,该脚本从多个JSON文件中读取数据并将其写入单个输出CSV文件中。我已经编写了一些代码,但它不正确。为了简单起见,我在这里格式化了 JSON,否则它会在一行上。每个"requestId"包含多个"id"值。我当前的代码只能读取一个"id"并重复 200 次。不知道为什么会这样。

JSON 文件

{  
"success":true,
"errors":[  
],
"requestId":"3561c",
"result":[  
{  
"id":257268,
"name":"02 ",
"description":"",
"createdAt":"2017-10-06T11:29:40Z+0000",
"updatedAt":"2017-11-07T13:38:11Z+0000",
"url":"https",
"subject":{  
"type":"Text",
"value":"Are you ready"
},
"fromName":{  
"type":"Text",
"value":"Centre"
},
"fromEmail":{  
"type":"Text",
"value":"abc@xyz.com"
},
"replyEmail":{  
"type":"Text",
"value":"noreply@xwz.com"
},
"folder":{  
"type":"Folder",
"value":8041,
"folderName":"A"
},
"operational":false,
"textOnly":false,
"publishToMSI":false,
"webView":false,
"status":"approved",
"template":681,
"workspace":"R",
"version":1,
"autoCopyToText":false
},
{  
"id":257273,
"name":"02a",
"description":"",
"createdAt":"2017-10-06T11:29:46Z+0000",
"updatedAt":"2017-11-07T13:38:19Z+0000",
"url":"https:",
"subject":{  
"type":"Text",
"value":"Still have questions?"
},
"fromName":{  
"type":"Text",
"value":"Centre"
},
"fromEmail":{  
"type":"Text",
"value":"abc@xyz.com"
},
"replyEmail":{  
"type":"Text",
"value":"noreply@xwz.com"
},
"folder":{  
"type":"Folder",
"value":8041,
"folderName":"A"
},
"operational":false,
"textOnly":false,
"publishToMSI":false,
"webView":false,
"status":"approved",
"template":681,
"workspace":"R",
"version":1,
"autoCopyToText":false },

蟒蛇代码

import json
import csv
import os
import codecs
import sys
reload(sys)
sys.setdefaultencoding('utf8')
file_dir = os.path.normpath('/home/pp/jobs/staging/')
exp_dir = os.path.normpath('/home/pp/jobs/CSV/')
exp_file_name = 'emails.csv'
exp_path = os.path.join(exp_dir, exp_file_name)

my_dict_list =[]
try:
for f in os.listdir(file_dir):
if f.endswith('.json') and f.startswith('emails_'):
file_path = os.path.join(file_dir, f)
data = open(file_path, 'r')
for line in data:
my_dict = {}
parsed_data = json.loads(line)
my_dict["REQUEST_ID"] = parsed_data["requestId"]
my_dict["SUCCESS"] = parsed_data["success"]
for result in parsed_data["result"]:
my_dict["RESULT_ID"] = result["id"]
my_dict["NAME"] = result["name"]
my_dict["DESCRIPTION"] = result.get("description")
my_dict["STATUS"] = result["status"].encode('utf-8')
my_dict["FOLDER_TYPE"] = result["folder"]["type"]
my_dict["FOLDER_ID"] = result["folder"]["value"]
my_dict["FOLDER_NAME"] = result["folder"]["folderName"]
my_dict["FROM_EMAIL_TYPE"] = result["fromEmail"]["type"]
my_dict["FROM_EMAIL_VALUE"] = result["fromEmail"]["value"]
my_dict["FROM_NAME_TYPE"] = result["fromName"]["type"]
my_dict["FROM_NAME_VALUE"] = result["fromName"]["value"]
my_dict["REPLY_EMAIL_TYPE"] = result["replyEmail"]["type"]
my_dict["REPLY_EMAIL_VALUE"] = result["replyEmail"]["value"]
my_dict["SUBJECT_TYPE"] = result["subject"]["type"]
my_dict["SUBJECT_VALUE"] = result["subject"]["value"]
my_dict["OPERATIONAL"] = result["operational"]
my_dict["PUBLISH_TO_MSI"] = result["publishToMSI"]
my_dict["TEMPLATE"] = result["template"]
my_dict["TEXT_ONLY"] = result["textOnly"]
my_dict["URL"] = result.get("url")
my_dict["WEBVIEW"] = result["webView"]
my_dict["CREATED_AT"] = result["createdAt"]
my_dict["UPDATED_AT"] = result["updatedAt"]
my_dict["WORKSPACE"] = result["workspace"]
my_dict_list.append(my_dict)
csv_columns = ["REQUEST_ID","SUCCESS","RESULT_ID","NAME","DESCRIPTION","STATUS","FOLDER_TYPE","FOLDER_ID","FOLDER_NAME","FROM_EMAIL_TYPE","FROM_EMAIL_VALUE","FROM_NAME_TYPE","FROM_NAME_VALUE","REPLY_EMAIL_TYPE","REPLY_EMAIL_VALUE","SUBJECT_TYPE","SUBJECT_VALUE","OPERATIONAL","PUBLISH_TO_MSI","TEMPLATE","TEXT_ONLY","URL","WEBVIEW","CREATED_AT","UPDATED_AT","WORKSPACE"]
with open(exp_path,'wb') as csvfile:
xz = csv.DictWriter(csvfile,fieldnames=csv_columns)
headers = {}
for n in xz.fieldnames:
headers[n] = n
xz.writerow(headers)
for data in my_dict_list:
xz.writerow(data)
except Exception as exception:
print("Please check the logs. JSON to CSV conversion failed for Emails: ", exception)

看这里:

my_dict_list =[]
try:
for f in os.listdir(file_dir):
if f.endswith('.json') and f.startswith('emails_'):
file_path = os.path.join(file_dir, f)
data = open(file_path, 'r')
for line in data:
my_dict = {}
parsed_data = json.loads(line)
# ...
for result in parsed_data["result"]:
# ...
my_dict_list.append(my_dict)

my_dict是一个仅在文件行级别更新的字典。但是你想做的似乎是parsed_data["result"]的每个元素。如果你将相同的字典附加到循环中的列表并改变它,你实际上是在将多个相同的副本放入列表中,当你变异时,你改变了所有副本。("复制"在Python中是一个不好的术语,因为它们实际上只是引用(

要解决您的问题,请尝试替换它:

my_dict_list.append(my_dict)

有了这个:

my_dict_list.append(dict(my_dict))

这将在放入列表之前制作一个(浅(副本。

这是 Python 中常见的问题。这里重要的是my_dict是指向字典的指针。

这里发生的事情是,您正在定义my_dict(指向字典的指针(,使用一组值更新它,然后将其附加到列表中。然后在循环的第二次迭代中,更改 my_dict 的值并将其附加到数组中的第二个位置。但是,my_dict也位于数组的第一个位置。因此,my_dict的值现在在数组的索引 0 和索引 1 中都更新了。

因此,列表中每个字典中的所有值都会更新,而不仅仅是 Id。这一直持续到循环的最后一次迭代,此时列表中的所有条目(它们都是my_dict的(都更新为 result 中最后一个字典的值。

解决此问题的一种方法是在每次迭代中定义一个新字典。

for line in data:
parsed_data = json.loads(line)
for result in parsed_data["result"]:
my_dict = {}
my_dict["REQUEST_ID"] = parsed_data["requestId"]
my_dict["SUCCESS"] = parsed_data["success"]
my_dict["RESULT_ID"] = result["id"]
my_dict["NAME"] = result["name"]
my_dict["DESCRIPTION"] = result.get("description")
my_dict["STATUS"] = result["status"].encode('utf-8')
my_dict["FOLDER_TYPE"] = result["folder"]["type"]
my_dict["FOLDER_ID"] = result["folder"]["value"]
my_dict["FOLDER_NAME"] = result["folder"]["folderName"]
my_dict["FROM_EMAIL_TYPE"] = result["fromEmail"]["type"]
my_dict["FROM_EMAIL_VALUE"] = result["fromEmail"]["value"]
my_dict["FROM_NAME_TYPE"] = result["fromName"]["type"]
my_dict["FROM_NAME_VALUE"] = result["fromName"]["value"]
my_dict["REPLY_EMAIL_TYPE"] = result["replyEmail"]["type"]
my_dict["REPLY_EMAIL_VALUE"] = result["replyEmail"]["value"]
my_dict["SUBJECT_TYPE"] = result["subject"]["type"]
my_dict["SUBJECT_VALUE"] = result["subject"]["value"]
my_dict["OPERATIONAL"] = result["operational"]
my_dict["PUBLISH_TO_MSI"] = result["publishToMSI"]
my_dict["TEMPLATE"] = result["template"]
my_dict["TEXT_ONLY"] = result["textOnly"]
my_dict["URL"] = result.get("url")
my_dict["WEBVIEW"] = result["webView"]
my_dict["CREATED_AT"] = result["createdAt"]
my_dict["UPDATED_AT"] = result["updatedAt"]
my_dict["WORKSPACE"] = result["workspace"]
my_dict_list.append(my_dict)

既然每个文件已经是一行了,为什么还要从文件中读取行?
这部分:

data = open(file_path, 'r')
for line in data:
my_dict = {}
parsed_data = json.loads(line)

可以减少到:

my_dict = {}
parsed_data = json.loads(open(file_path, 'r').read())

相关内容

最新更新