比较两个json文件并匹配具有相同值的条目



我需要按以下键比较两个json文件:a、b、c、d、e

如果json文件1和json文件2中某个条目的所有键都匹配,那么我需要为该条目计算两个平台时间之间的差值(delta),然后从json文件1和json文件2中删除这些条目。(两个json文件都有10000000000个条目 :)

所以这里我们应该匹配:

1) one[0] 和 two[1]
2) one[1] 和 two[0]

数据json一和json二:第一个文件-

"one": [
{
"a" : "2022-09-12 00:00:00.000",
"b" : "apple",
"c" : "1",
"d" : "2022-09-11 23:59:59.997",
"e" : 88
},
{
"a" : "2022-09-12 00:00:00.000",
"b" : "orange",
"c" : "2",
"d" : "2022-09-11 23:59:59.997",
"e" : 87
},      
{
"a" : "2022-09-12 00:00:10.001",
"b" : "apple",
"c" : "6",
"d" : "2022-09-11 23:59:59.997",
"e" : 88
},...]

第二个文件-

"two": [
{
"a" : "2022-09-12 00:00:30.000",
"b" : "orange",
"c" : "2",
"d" : "2022-09-11 23:59:59.997",
"e" : 87
},
{
"a" : "2022-09-12 00:00:10.001",
"b" : "apple",
"c" : "1",
"d" : "2022-09-11 23:59:59.997",
"e" : 88
},
{
"a" : "2022-09-12 00:00:30.000",
"b" : "orange",
"c" : "200",
"d" : "2021-09-11 23:59:59.997",
"e" : 81
},...
]

我开始做这样的事情,但所有元素的迭代都需要太多时间。

你能帮我优化代码吗?

import datetime
import json
import numpy as np
import random

# Layout of the 'a' timestamps, e.g. "2022-09-12 00:00:00.000".
TIME_FORMAT = '%Y-%m-%d %H:%M:%S.%f'

# Signed time differences (two - one, in seconds) of every matched pair.
lst_in_seconds = []

with open('one_all.json') as f:
    one = json.load(f)

with open('two_all.json') as f1:
    two = json.load(f1)

counter_one_better = 0
counter_two_better = 0
counter_the_same = 0

# Brute-force matching: each entry of `one` is compared against the entries
# of `two` until the first one with equal b/c/d/e values is found.  The nested
# loops make the run time grow with len(one) * len(two).
for entry_one in one['one']:
    for entry_two in two['two']:
        if (entry_one['b'] == entry_two['b']
                and entry_one['e'] == entry_two['e']
                and entry_one['d'] == entry_two['d']
                and entry_one['c'] == entry_two['c']):
            # The timestamps share one fixed-width format, so comparing the
            # strings orders them chronologically.
            if entry_one['a'] < entry_two['a']:
                # one better than two
                delt_one = datetime.datetime.strptime(entry_one['a'], TIME_FORMAT)
                delt_two = datetime.datetime.strptime(entry_two['a'], TIME_FORMAT)
                delta = delt_two - delt_one
                diff_in_seconds = delta.total_seconds()
                lst_in_seconds.append(diff_in_seconds)
                counter_one_better += 1
                # Overwrite 'b' so this entry of `two` cannot match again.
                entry_two['b'] = random.randint(0, 100000)
                break
            elif entry_one['a'] == entry_two['a']:
                # same
                delt_one = datetime.datetime.strptime(entry_one['a'], TIME_FORMAT)
                delt_two = datetime.datetime.strptime(entry_two['a'], TIME_FORMAT)
                delta = delt_two - delt_one
                diff_in_seconds = delta.total_seconds()
                lst_in_seconds.append(diff_in_seconds)
                counter_the_same += 1
                entry_two['b'] = random.randint(0, 100000)
                break
            elif entry_one['a'] > entry_two['a']:
                # two better than one: store the difference with a negative
                # sign so the list always holds (two - one).
                delt_one = datetime.datetime.strptime(entry_one['a'], TIME_FORMAT)
                delt_two = datetime.datetime.strptime(entry_two['a'], TIME_FORMAT)
                delta = delt_one - delt_two
                diff_in_seconds = delta.total_seconds()
                lst_in_seconds.append(-diff_in_seconds)
                counter_two_better += 1
                entry_two['b'] = random.randint(0, 100000)
                break

#print('counter_the_same',counter_the_same,'count')
#print('counter_one_better',counter_one_better,'count')
#print('counter_two_better',counter_two_better,'count','\n')
total_matches = counter_two_better + counter_one_better + counter_the_same
if total_matches:
    print('one better than two in ', round((counter_one_better / total_matches) * 100, 4), '% case')
    print('the same ', round((counter_the_same / total_matches) * 100, 4), '% case')
    print('two better than one in ', round((counter_two_better / total_matches) * 100, 4), '% case', '\n')
else:
    # Without this guard the percentages would divide by zero.
    print('no matching entries found')

您可以先把两个文件预处理成字典,以待比较的值作为字典的键,从而减少比较文件所需的时间。然后,对于one中的每个条目,您可以在two中查找具有相同b、c、d、e值的条目,并比较时间。另外请注意,您可以让代码更加“DRY”(不重复):`if` 的三个分支之间唯一的区别只是更新的是哪个计数器。

from collections import defaultdict
import datetime
import random
# Sample data standing in for the contents of the two JSON files.
one = {
    "one": [
        {
            "a": "2022-09-12 00:00:00.000",
            "b": "apple",
            "c": "1",
            "d": "2022-09-11 23:59:59.997",
            "e": 88,
        },
        {
            "a": "2022-09-12 00:00:00.000",
            "b": "orange",
            "c": "2",
            "d": "2022-09-11 23:59:59.997",
            "e": 87,
        },
        {
            "a": "2022-09-12 00:00:10.001",
            "b": "apple",
            "c": "6",
            "d": "2022-09-11 23:59:59.997",
            "e": 88,
        },
    ]
}
two = {
    "two": [
        {
            "a": "2022-09-12 00:00:30.000",
            "b": "orange",
            "c": "2",
            "d": "2022-09-11 23:59:59.997",
            "e": 87,
        },
        {
            "a": "2022-09-12 00:00:10.001",
            "b": "apple",
            "c": "1",
            "d": "2022-09-11 23:59:59.997",
            "e": 88,
        },
        {
            "a": "2022-09-12 00:00:30.000",
            "b": "orange",
            "c": "200",
            "d": "2021-09-11 23:59:59.997",
            "e": 81,
        },
    ]
}

# Two entries describe the same record when all of these fields are equal.
compare_keys = ['b', 'c', 'd', 'e']
# Timestamp field whose difference is measured for matched entries.
value_key = 'a'
# Field overwritten on a matched entry of `two` to mark it as consumed.
set_key = 'b'
# Layout of the timestamps, e.g. "2022-09-12 00:00:00.000".
TIME_FORMAT = '%Y-%m-%d %H:%M:%S.%f'

# Group the timestamps of `one` by the tuple of compared values, so the
# matching entries of `two` are found with one dictionary lookup instead of
# a scan over the whole list.
ones = defaultdict(list)
for o in one['one']:
    ones[tuple(o[k] for k in compare_keys)].append(o[value_key])

# Group `two` the same way, keeping the timestamp and the field that gets
# overwritten once the entry has been matched.
twos = defaultdict(list)
for t in two['two']:
    twos[tuple(t[k] for k in compare_keys)].append(
        {k: t[k] for k in [value_key, set_key]}
    )

counter_one_better = 0
counter_two_better = 0
counter_the_same = 0
# Signed time differences (two - one, in seconds) of every matched pair.
lst_in_seconds = []

for key, one_times in ones.items():
    two_entries = twos.get(key)
    if two_entries is None:
        continue
    # Pair entries one-to-one in file order: the brute-force loop in the
    # question stops at the first unused match, so every entry of `two` is
    # consumed by at most one entry of `one`.
    for one_time, two_entry in zip(one_times, two_entries):
        delt_one = datetime.datetime.strptime(one_time, TIME_FORMAT)
        delt_two = datetime.datetime.strptime(two_entry[value_key], TIME_FORMAT)
        diff_in_seconds = (delt_two - delt_one).total_seconds()
        lst_in_seconds.append(diff_in_seconds)
        # A positive difference means `one` carries the earlier timestamp,
        # which the question counts as "one better than two".
        counter_one_better += diff_in_seconds > 0
        counter_the_same += diff_in_seconds == 0
        counter_two_better += diff_in_seconds < 0
        two_entry[set_key] = random.randint(0, 100000)

print(counter_one_better, counter_the_same, counter_two_better)
print(lst_in_seconds)

# reconstruct the two dict
# The stored entry comes last so that an overwritten `set_key` value replaces
# the one taken from the grouping key.
new_two = {
    'two': [
        {**dict(zip(compare_keys, key)), **entry}
        for key, entries in twos.items()
        for entry in entries
    ]
}

输出(针对您的样本数据):

# counter_one_better, counter_the_same, counter_two_better
0 0 2
# lst_in_seconds
[10.001, 30.0]
# new_two
{
"two": [
{
"b": 4459,
"c": "2",
"d": "2022-09-11 23:59:59.997",
"e": 87,
"a": "2022-09-12 00:00:30.000"
},
{
"b": 93855,
"c": "1",
"d": "2022-09-11 23:59:59.997",
"e": 88,
"a": "2022-09-12 00:00:10.001"
},
{
"b": "orange",
"c": "200",
"d": "2021-09-11 23:59:59.997",
"e": 81,
"a": "2022-09-12 00:00:30.000"
}
]
}

最新更新