I have 3 CSV files: loans, customers, and securities, each fairly large (800k+ rows). The files are linked by a single unique-identifier column. My goal is to create a JSON file, and the code below achieves that, but it is very slow.
My question is: how can I reach the same result faster?
import csv
import json

def multicsvtojson():
    loanscsvfile = open('C:\***\loan.csv', 'r')
    custcsvfile = open('C:\***\customer.csv', 'r')
    securcsvfile = open('C:\***\security.csv', 'r')
    loansreader = csv.DictReader(loanscsvfile, delimiter=',')
    custreader = csv.DictReader(custcsvfile, delimiter=',')
    securreader = csv.DictReader(securcsvfile, delimiter=',')
    jsonfile = open('test.json', 'w')  # ready json file

    output = []
    loanscount = 0  # total loan count

    for loansrow in loansreader:
        loansrow['customers'] = []
        loansrow['securities'] = []
        output.append(loansrow)
        # rewind the customer and security files for every loan row
        custcsvfile.seek(0)
        securcsvfile.seek(0)
        for custrow in custreader:
            if loansrow["UniqueID"] == custrow["UniqueID"]:
                loansrow['customers'].append(custrow)
        for securrow in securreader:
            if loansrow["UniqueID"] == securrow["UniqueID"]:
                loansrow['securities'].append(securrow)
        loanscount = loanscount + 1  # increment the loan counter
        print(loanscount)

    total = {}
    total['total'] = loanscount
    output.insert(0, total)
    json.dump(output, jsonfile, indent=4)
The current output looks like this:
[{
    "total": 2
},
{
    "uniqueID": "",
    "uniqueID2": "",
    "colA": "",
    "colB": "",
    "colC": "",
    "colD": "",
    "customers": [
        {
            "uniqueID": "",
            "custID": "",
            "colA": "",
            "colB": ""
        }
    ],
    "securities": [
        {
            "uniqueID": "",
            "secuID": "",
            "colA": "",
            "colB": ""
        }
    ]
},
{
    "uniqueID": "",
    "uniqueID2": "",
    "colA": "",
    "colB": "",
    "colC": "",
    "colD": "",
    "customers": [
        {
            "uniqueID": "",
            "custID": "",
            "colA": "",
            "colB": ""
        },
        {
            "uniqueID": "",
            "custID": "",
            "colA": "",
            "colB": ""
        }
    ],
    "securities": [
        {
            "uniqueID": "",
            "secuID": "",
            "colA": "",
            "colB": ""
        },
        {
            "uniqueID": "",
            "secuID": "",
            "colA": "",
            "colB": ""
        }
    ]
}]
The performance cost most likely comes from the repeated reads of the customer and security files: both files are re-read in full for every loan row, which is roughly loans × (customers + securities) comparisons, on the order of a trillion with 800k+ rows per file.
Try grouping the customers and securities by ID before iterating over the loan file, so you never have to re-read them:
import csv
import json

def multicsvtojson():
    loanscsvfile = open('C:\***\loan.csv', 'r')
    custcsvfile = open('C:\***\customer.csv', 'r')
    securcsvfile = open('C:\***\security.csv', 'r')
    loansreader = csv.DictReader(loanscsvfile, delimiter=',')
    custreader = csv.DictReader(custcsvfile, delimiter=',')
    securreader = csv.DictReader(securcsvfile, delimiter=',')
    jsonfile = open('test.json', 'w')  # ready json file

    output = []
    loanscount = 0

    # group customers by ID (one pass over the file)
    customers = {}
    for custrow in custreader:
        id = custrow["UniqueID"]
        if id not in customers:
            customers[id] = []
        customers[id].append(custrow)

    # group securities by ID (one pass over the file)
    securities = {}
    for securrow in securreader:
        id = securrow["UniqueID"]
        if id not in securities:
            securities[id] = []
        securities[id].append(securrow)

    # single pass over the loans; each lookup is now a constant-time dict access
    for loansrow in loansreader:
        loansrow['customers'] = customers.get(loansrow["UniqueID"], [])
        loansrow['securities'] = securities.get(loansrow["UniqueID"], [])
        output.append(loansrow)
        loanscount = loanscount + 1  # increment the loan counter
        print(loanscount)

    total = {}
    total['total'] = loanscount
    output.insert(0, total)
    json.dump(output, jsonfile, indent=4)
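As a side note, the two grouping loops can be written more compactly with collections.defaultdict, which creates the empty list on first access. A minimal sketch of just the customer grouping:

from collections import defaultdict

# defaultdict(list) creates an empty list automatically on first access,
# so the explicit "if id not in customers" check disappears
customers = defaultdict(list)
for custrow in custreader:
    customers[custrow["UniqueID"]].append(custrow)

Keep using customers.get(...) for the lookups afterwards, since indexing a defaultdict with a missing loan ID would silently insert empty entries.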
If the CSV files are too large to hold in memory, you could also try a temporary database such as tinydb.
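For illustration, here is a minimal sketch of that idea using the standard library's sqlite3 in place of tinydb (sqlite3 ships with Python and keeps its data on disk). The file names and the UniqueID column are carried over from the question; treat them as assumptions that must match your real headers. The final output list still grows with the loan file; the win is that the customer and security rows no longer live in memory.

import csv
import json
import sqlite3

def multicsvtojson_sqlite():
    db = sqlite3.connect('temp.db')  # on-disk scratch database
    db.execute('CREATE TABLE customers (uid TEXT, row TEXT)')
    db.execute('CREATE TABLE securities (uid TEXT, row TEXT)')
    db.execute('CREATE INDEX idx_cust ON customers (uid)')
    db.execute('CREATE INDEX idx_secu ON securities (uid)')

    # load each child file once, storing every row as a JSON blob keyed by UniqueID
    with open('customer.csv', 'r') as f:
        db.executemany('INSERT INTO customers VALUES (?, ?)',
                       ((r['UniqueID'], json.dumps(r)) for r in csv.DictReader(f)))
    with open('security.csv', 'r') as f:
        db.executemany('INSERT INTO securities VALUES (?, ?)',
                       ((r['UniqueID'], json.dumps(r)) for r in csv.DictReader(f)))
    db.commit()

    # stream the loans and do indexed lookups instead of in-memory dicts
    with open('loan.csv', 'r') as f, open('test.json', 'w') as jsonfile:
        output = []
        for loansrow in csv.DictReader(f):
            uid = loansrow['UniqueID']
            loansrow['customers'] = [json.loads(r[0]) for r in
                db.execute('SELECT row FROM customers WHERE uid = ?', (uid,))]
            loansrow['securities'] = [json.loads(r[0]) for r in
                db.execute('SELECT row FROM securities WHERE uid = ?', (uid,))]
            output.append(loansrow)
        output.insert(0, {'total': len(output)})
        json.dump(output, jsonfile, indent=4)
    db.close()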