如何将相同类型的JSON对象列表转换为Pandas数据帧



我正在读取一个包含网络上大量事件信息的文件。我想要每种事件类型的Pandas数据帧。我已经阅读了文件,并根据事件进行了筛选。现在,我为每个事件都有一个单独的对象列表。

import json
# Event data is collected in one list per event type.
flowData = []
alertData = []
dhcpData = []
statsData = []
httpData = []
fileInfoData = []
sshData = []
dnsData = []
tlsData = []
data = []

# Dispatch table: "event_type" value -> list that collects those records.
_event_lists = {
    "flow": flowData,
    "alert": alertData,
    "dhcp": dhcpData,
    "stats": statsData,
    "http": httpData,
    "fileinfo": fileInfoData,
    "ssh": sshData,
    "dns": dnsData,
    "tls": tlsData,
}

# The file holds one JSON object per line, so each line is parsed separately.
with open('eve.json') as file:
    for line in file:
        # json.loads raises on an empty string; skip blank lines.
        if line.strip():
            data.append(json.loads(line))

for event in data:
    # The original test `line["event_type"]=="flow" in line` was a chained
    # comparison, i.e. it additionally required a "flow" key in the record.
    # A plain lookup on the event type is what the other branches did.
    target = _event_lists.get(event["event_type"])
    if target is not None:
        target.append(event)
    else:
        # The original had the label as a bare string expression (a no-op);
        # print it together with the offending record.
        print("unsupported line:", event)

如何将每个列表转换为各自的Pandas数据帧?我知道有一个pandas方法pd.read_json,但是由于我的json文件包含不同类型的对象,所以这个方法不起作用。

示例JSON

{"timestamp":"2020-03-26T10:16:13.248647+0000","event_type":"stats","stats":{"uptime":42,"capture":{"kernel_packets":33,"kernel_drops":0,"errors":0},"decoder":{"pkts":33,"bytes":10827,"invalid":0,"ipv4":19,"ipv6":0,"ethernet":33,"raw":0,"null":0,"sll":0,"tcp":0,"udp":19,"sctp":0,"icmpv4":0,"icmpv6":0,"ppp":0,"pppoe":0,"gre":0,"vlan":0,"vlan_qinq":0,"ieee8021ah":0,"teredo":0,"ipv4_in_ipv6":0,"ipv6_in_ipv6":0,"mpls":0,"avg_pkt_size":328,"max_pkt_size":494,"erspan":0,"ipraw":{"invalid_ip_version":0},"ltnull":{"pkt_too_small":0,"unsupported_type":0},"dce":{"pkt_too_small":0}},"flow":{"memcap":0,"tcp":0,"udp":12,"icmpv4":0,"icmpv6":0,"spare":10000,"emerg_mode_entered":0,"emerg_mode_over":0,"tcp_reuse":0,"memuse":6597184},"defrag":{"ipv4":{"fragments":0,"reassembled":0,"timeouts":0},"ipv6":{"fragments":0,"reassembled":0,"timeouts":0},"max_frag_hits":0},"tcp":{"sessions":0,"ssn_memcap_drop":0,"pseudo":0,"pseudo_failed":0,"invalid_checksum":0,"no_flow":0,"syn":0,"synack":0,"rst":0,"midstream_pickups":0,"pkt_on_wrong_thread":0,"segment_memcap_drop":0,"stream_depth_reached":0,"reassembly_gap":0,"overlap":0,"overlap_diff_data":0,"insert_data_normal_fail":0,"insert_data_overlap_fail":0,"insert_list_fail":0,"memuse":2031616,"reassembly_memuse":294912},"detect":{"engines":[{"id":0,"last_reload":"2020-03-26T10:16:03.882522+0000","rules_loaded":13659,"rules_failed":0}],"alert":0},"app_layer":{"flow":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":0,"imap":0,"msn":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ntp":0,"ftp-data":0,"tftp":0,"ikev2":0,"krb5_tcp":0,"dhcp":0,"failed_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"krb5_udp":0,"failed_udp":12},"tx":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ftp-data":0,"krb5_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"ntp":0,"tftp":0,"ikev2":0,"krb5_udp":0,"dhcp":0},"expectations":0},"flow_mgr":{"closed_pruned":0,"new_pruned":0,"est_pruned":0,"bypassed_pruned":0,"flows_checked":0,"flows_notimeo
ut":0,"flows_timeout":0,"flows_timeout_inuse":0,"flows_removed":0,"rows_checked":65536,"rows_skipped":65536,"rows_empty":0,"rows_busy":0,"rows_maxlen":0},"dns":{"memuse":0,"memcap_state":0,"memcap_global":0},"http":{"memuse":0,"memcap":0},"ftp":{"memuse":0,"memcap":0}}}
{"timestamp":"2020-03-26T10:16:14.770482+0000","flow_id":771992469731822,"in_iface":"wlan0","event_type":"ssh","src_ip":"192.168.1.222","src_port":59521,"dest_ip":"192.168.1.234","dest_port":22,"proto":"TCP","ssh":{"client":{"proto_version":"2.0","software_version":"WinSCP_release_5.15.9"},"server":{"proto_version":"2.0","software_version":"OpenSSH_7.9p1 Raspbian-10+deb10u2"}}}
{"timestamp":"2020-03-26T10:16:19.435893+0000","flow_id":2061547810825909,"in_iface":"wlan0","event_type":"alert","src_ip":"192.168.1.194","src_port":57621,"dest_ip":"192.168.1.255","dest_port":57621,"proto":"UDP","alert":{"action":"allowed","gid":1,"signature_id":2027397,"rev":1,"signature":"ET POLICY Spotify P2P Client","category":"Not Suspicious Traffic","severity":3,"metadata":{"updated_at":["2019_05_30"],"performance_impact":["Low"],"created_at":["2019_05_30"],"signature_severity":["Minor"],"deployment":["Internal"],"attack_target":["Client_Endpoint"],"affected_product":["Windows_Client_Apps"]}},"app_proto":"failed","flow":{"pkts_toserver":1,"pkts_toclient":0,"bytes_toserver":86,"bytes_toclient":0,"start":"2020-03-26T10:16:19.435893+0000"}}
{"timestamp":"2020-03-26T10:16:21.252753+0000","event_type":"stats","stats":{"uptime":50,"capture":{"kernel_packets":121,"kernel_drops":0,"errors":0},"decoder":{"pkts":121,"bytes":28365,"invalid":0,"ipv4":86,"ipv6":0,"ethernet":121,"raw":0,"null":0,"sll":0,"tcp":49,"udp":37,"sctp":0,"icmpv4":0,"icmpv6":0,"ppp":0,"pppoe":0,"gre":0,"vlan":0,"vlan_qinq":0,"ieee8021ah":0,"teredo":0,"ipv4_in_ipv6":0,"ipv6_in_ipv6":0,"mpls":0,"avg_pkt_size":234,"max_pkt_size":1158,"erspan":0,"ipraw":{"invalid_ip_version":0},"ltnull":{"pkt_too_small":0,"unsupported_type":0},"dce":{"pkt_too_small":0}},"flow":{"memcap":0,"tcp":1,"udp":22,"icmpv4":0,"icmpv6":0,"spare":9999,"emerg_mode_entered":0,"emerg_mode_over":0,"tcp_reuse":0,"memuse":6600064},"defrag":{"ipv4":{"fragments":0,"reassembled":0,"timeouts":0},"ipv6":{"fragments":0,"reassembled":0,"timeouts":0},"max_frag_hits":0},"tcp":{"sessions":1,"ssn_memcap_drop":0,"pseudo":0,"pseudo_failed":0,"invalid_checksum":0,"no_flow":0,"syn":1,"synack":1,"rst":0,"midstream_pickups":0,"pkt_on_wrong_thread":0,"segment_memcap_drop":0,"stream_depth_reached":0,"reassembly_gap":0,"overlap":5,"overlap_diff_data":0,"insert_data_normal_fail":0,"insert_data_overlap_fail":0,"insert_list_fail":0,"memuse":2031616,"reassembly_memuse":294912},"detect":{"engines":[{"id":0,"last_reload":"2020-03-26T10:16:03.882522+0000","rules_loaded":13659,"rules_failed":0}],"alert":1},"app_layer":{"flow":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":1,"imap":0,"msn":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ntp":0,"ftp-data":0,"tftp":0,"ikev2":0,"krb5_tcp":0,"dhcp":0,"failed_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"krb5_udp":0,"failed_udp":22},"tx":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ftp-data":0,"krb5_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"ntp":0,"tftp":0,"ikev2":0,"krb5_udp":0,"dhcp":0},"expectations":0},"flow_mgr":{"closed_pruned":0,"new_pruned":0,"est_pruned":0,"bypassed_pruned":0,"flows_checked":2,"flows_not
imeout":2,"flows_timeout":0,"flows_timeout_inuse":0,"flows_removed":0,"rows_checked":65536,"rows_skipped":65534,"rows_empty":0,"rows_busy":0,"rows_maxlen":1},"dns":{"memuse":0,"memcap_state":0,"memcap_global":0},"http":{"memuse":0,"memcap":0},"ftp":{"memuse":0,"memcap":0}}}
{"timestamp":"2020-03-26T10:16:24.972539+0000","flow_id":342276697544443,"in_iface":"wlan0","event_type":"alert","src_ip":"fe80:0000:0000:0000:f286:20ff:fe42:f22b","dest_ip":"ff02:0000:0000:0000:0000:0000:0000:0001","proto":"IPv6-ICMP","icmp_type":134,"icmp_code":0,"alert":{"action":"allowed","gid":1,"signature_id":0,"rev":0,"signature":"RULE TRIGGRED","category":"","severity":3},"flow":{"pkts_toserver":1,"pkts_toclient":0,"bytes_toserver":118,"bytes_toclient":0,"start":"2020-03-26T10:16:24.972539+0000"}}
{"timestamp":"2020-03-26T10:16:24.974956+0000","flow_id":573109714870380,"in_iface":"wlan0","event_type":"alert","src_ip":"fe80:0000:0000:0000:986d:1d1b:673f:b319","dest_ip":"ff02:0000:0000:0000:0000:0000:0000:0001","proto":"IPv6-ICMP","icmp_type":136,"icmp_code":0,"alert":{"action":"allowed","gid":1,"signature_id":0,"rev":0,"signature":"RULE TRIGGRED","category":"","severity":3},"flow":{"pkts_toserver":1,"pkts_toclient":0,"bytes_toserver":86,"bytes_toclient":0,"start":"2020-03-26T10:16:24.974956+0000"}}
{"timestamp":"2020-03-26T10:16:24.998168+0000","flow_id":607027071564568,"in_iface":"wlan0","event_type":"alert","src_ip":"fe80:0000:0000:0000:986d:1d1b:673f:b319","dest_ip":"ff02:0000:0000:0000:0000:0000:0000:0016","proto":"IPv6-ICMP","icmp_type":143,"icmp_code":0,"alert":{"action":"allowed","gid":1,"signature_id":0,"rev":0,"signature":"RULE TRIGGRED","category":"","severity":3},"flow":{"pkts_toserver":1,"pkts_toclient":0,"bytes_toserver":130,"bytes_toclient":0,"start":"2020-03-26T10:16:24.998168+0000"}}
{"timestamp":"2020-03-26T10:16:29.257116+0000","event_type":"stats","stats":{"uptime":58,"capture":{"kernel_packets":181,"kernel_drops":0,"errors":0},"decoder":{"pkts":181,"bytes":45698,"invalid":0,"ipv4":116,"ipv6":8,"ethernet":181,"raw":0,"null":0,"sll":0,"tcp":66,"udp":52,"sctp":0,"icmpv4":0,"icmpv6":6,"ppp":0,"pppoe":0,"gre":0,"vlan":0,"vlan_qinq":0,"ieee8021ah":0,"teredo":0,"ipv4_in_ipv6":0,"ipv6_in_ipv6":0,"mpls":0,"avg_pkt_size":252,"max_pkt_size":1514,"erspan":0,"ipraw":{"invalid_ip_version":0},"ltnull":{"pkt_too_small":0,"unsupported_type":0},"dce":{"pkt_too_small":0}},"flow":{"memcap":0,"tcp":1,"udp":31,"icmpv4":0,"icmpv6":3,"spare":10000,"emerg_mode_entered":0,"emerg_mode_over":0,"tcp_reuse":0,"memuse":6602704},"defrag":{"ipv4":{"fragments":0,"reassembled":0,"timeouts":0},"ipv6":{"fragments":0,"reassembled":0,"timeouts":0},"max_frag_hits":0},"tcp":{"sessions":1,"ssn_memcap_drop":0,"pseudo":0,"pseudo_failed":0,"invalid_checksum":0,"no_flow":0,"syn":1,"synack":1,"rst":0,"midstream_pickups":0,"pkt_on_wrong_thread":0,"segment_memcap_drop":0,"stream_depth_reached":0,"reassembly_gap":0,"overlap":5,"overlap_diff_data":0,"insert_data_normal_fail":0,"insert_data_overlap_fail":0,"insert_list_fail":0,"memuse":2031616,"reassembly_memuse":294912},"detect":{"engines":[{"id":0,"last_reload":"2020-03-26T10:16:03.882522+0000","rules_loaded":13659,"rules_failed":0}],"alert":4},"app_layer":{"flow":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":1,"imap":0,"msn":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ntp":0,"ftp-data":0,"tftp":0,"ikev2":0,"krb5_tcp":0,"dhcp":0,"failed_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"krb5_udp":0,"failed_udp":31},"tx":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ftp-data":0,"krb5_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"ntp":0,"tftp":0,"ikev2":0,"krb5_udp":0,"dhcp":0},"expectations":0},"flow_mgr":{"closed_pruned":0,"new_pruned":0,"est_pruned":0,"bypassed_pruned":0,"flows_checked":0,"flows_n
otimeout":0,"flows_timeout":0,"flows_timeout_inuse":0,"flows_removed":0,"rows_checked":65536,"rows_skipped":65536,"rows_empty":0,"rows_busy":0,"rows_maxlen":0},"dns":{"memuse":0,"memcap_state":0,"memcap_global":0},"http":{"memuse":0,"memcap":0},"ftp":{"memuse":0,"memcap":0}}}

我建议在这里使用defaultdict

from collections import defaultdict

# Parse every line of eve.json into its own Python object.
with open('eve.json') as file:
    data = [json.loads(raw) for raw in file]

# Group the parsed records by the value of their "event_type" key.
d = defaultdict(list)
for event in data:
    d[event["event_type"]].append(event)

然后对每个列表分别调用DataFrame构造函数,得到一个以事件类型为键、DataFrame为值的字典:

import pandas as pd  # `pd` is used below but was never imported in the snippet

# Build one DataFrame per event type; nested objects stay as dict-valued cells.
dfs = {event_type: pd.DataFrame(events) for event_type, events in d.items()}

print (dfs['alert'])
timestamp           flow_id in_iface event_type  
0  2020-03-26T10:16:19.435893+0000  2061547810825909    wlan0      alert   
1  2020-03-26T10:16:24.972539+0000   342276697544443    wlan0      alert   
2  2020-03-26T10:16:24.974956+0000   573109714870380    wlan0      alert   
3  2020-03-26T10:16:24.998168+0000   607027071564568    wlan0      alert   
src_ip  src_port  
0                            192.168.1.194   57621.0   
1  fe80:0000:0000:0000:f286:20ff:fe42:f22b       NaN   
2  fe80:0000:0000:0000:986d:1d1b:673f:b319       NaN   
3  fe80:0000:0000:0000:986d:1d1b:673f:b319       NaN   
dest_ip  dest_port      proto  
0                            192.168.1.255    57621.0        UDP   
1  ff02:0000:0000:0000:0000:0000:0000:0001        NaN  IPv6-ICMP   
2  ff02:0000:0000:0000:0000:0000:0000:0001        NaN  IPv6-ICMP   
3  ff02:0000:0000:0000:0000:0000:0000:0016        NaN  IPv6-ICMP   
alert app_proto  
0  {'action': 'allowed', 'gid': 1, 'signature_id'...    failed   
1  {'action': 'allowed', 'gid': 1, 'signature_id'...       NaN   
2  {'action': 'allowed', 'gid': 1, 'signature_id'...       NaN   
3  {'action': 'allowed', 'gid': 1, 'signature_id'...       NaN   
flow  icmp_type  icmp_code  
0  {'pkts_toserver': 1, 'pkts_toclient': 0, 'byte...        NaN        NaN  
1  {'pkts_toserver': 1, 'pkts_toclient': 0, 'byte...      134.0        0.0  
2  {'pkts_toserver': 1, 'pkts_toclient': 0, 'byte...      136.0        0.0  
3  {'pkts_toserver': 1, 'pkts_toclient': 0, 'byte...      143.0        0.0  
print (dfs['ssh'])
timestamp          flow_id in_iface event_type  
0  2020-03-26T10:16:14.770482+0000  771992469731822    wlan0        ssh   
src_ip  src_port        dest_ip  dest_port proto  
0  192.168.1.222     59521  192.168.1.234         22   TCP   
ssh  
0  {'client': {'proto_version': '2.0', 'software_...  

如果想要更扁平化的数据,请使用pandas的json_normalize:

import pandas as pd

# Use the top-level pandas.json_normalize (available since pandas 1.0); the
# old `pandas.io.json.json_normalize` import path is deprecated.
# Nested objects are flattened into dotted column names such as
# "flow.pkts_toserver".
dfs = {event_type: pd.json_normalize(events) for event_type, events in d.items()}
print (dfs['alert'])
timestamp           flow_id in_iface event_type  
0  2020-03-26T10:16:19.435893+0000  2061547810825909    wlan0      alert   
1  2020-03-26T10:16:24.972539+0000   342276697544443    wlan0      alert   
2  2020-03-26T10:16:24.974956+0000   573109714870380    wlan0      alert   
3  2020-03-26T10:16:24.998168+0000   607027071564568    wlan0      alert   
src_ip  src_port  
0                            192.168.1.194   57621.0   
1  fe80:0000:0000:0000:f286:20ff:fe42:f22b       NaN   
2  fe80:0000:0000:0000:986d:1d1b:673f:b319       NaN   
3  fe80:0000:0000:0000:986d:1d1b:673f:b319       NaN   
dest_ip  dest_port      proto app_proto  
0                            192.168.1.255    57621.0        UDP    failed   
1  ff02:0000:0000:0000:0000:0000:0000:0001        NaN  IPv6-ICMP       NaN   
2  ff02:0000:0000:0000:0000:0000:0000:0001        NaN  IPv6-ICMP       NaN   
3  ff02:0000:0000:0000:0000:0000:0000:0016        NaN  IPv6-ICMP       NaN   
alert.metadata.deployment  alert.metadata.attack_target  
0  ...                [Internal]             [Client_Endpoint]   
1  ...                       NaN                           NaN   
2  ...                       NaN                           NaN   
3  ...                       NaN                           NaN   
alert.metadata.affected_product  flow.pkts_toserver flow.pkts_toclient  
0            [Windows_Client_Apps]                   1                  0   
1                              NaN                   1                  0   
2                              NaN                   1                  0   
3                              NaN                   1                  0   
flow.bytes_toserver  flow.bytes_toclient                       flow.start  
0                  86                    0  2020-03-26T10:16:19.435893+0000   
1                 118                    0  2020-03-26T10:16:24.972539+0000   
2                  86                    0  2020-03-26T10:16:24.974956+0000   
3                 130                    0  2020-03-26T10:16:24.998168+0000   
icmp_type icmp_code  
0       NaN       NaN  
1     134.0       0.0  
2     136.0       0.0  
3     143.0       0.0  
[4 rows x 31 columns]
print (dfs['ssh'])
timestamp          flow_id in_iface event_type  
0  2020-03-26T10:16:14.770482+0000  771992469731822    wlan0        ssh   
src_ip  src_port        dest_ip  dest_port proto  
0  192.168.1.222     59521  192.168.1.234         22   TCP   
ssh.client.proto_version ssh.client.software_version  
0                      2.0       WinSCP_release_5.15.9   
ssh.server.proto_version        ssh.server.software_version  
0                      2.0  OpenSSH_7.9p1 Raspbian-10+deb10u2  

最新更新