我正在读取一个包含网络上大量事件信息的文件。我想要每种事件类型的Pandas数据帧。我已经阅读了文件,并根据事件进行了筛选。现在,我为每个事件都有一个单独的对象列表。
import json
# Event data will be stored in lists, one list per supported event type.
flowData = []
alertData = []
dhcpData = []
statsData = []
httpData = []
fileInfoData = []
sshData = []
dnsData = []
tlsData = []
data = []

# The file holds one JSON object per line, so it is decoded line by line.
with open('eve.json') as file:
    for line in file:
        data.append(json.loads(line))

# Dispatch table: "event_type" value -> the list that collects those records.
_lists_by_event_type = {
    "flow": flowData,
    "alert": alertData,
    "dhcp": dhcpData,
    "stats": statsData,
    "http": httpData,
    "fileinfo": fileInfoData,
    "ssh": sshData,
    "dns": dnsData,
    "tls": tlsData,
}

for line in data:
    # Records are routed purely by their "event_type" value; a record is not
    # required to also contain a key named after its type.
    target = _lists_by_event_type.get(line["event_type"])
    if target is not None:
        target.append(line)
    else:
        # Report records whose event type has no dedicated list.
        print("unsupported line:")
        print(line)
如何将每个列表转换为各自的 pandas 数据帧?我知道有 pandas 方法 pd.read_json,但是由于我的 JSON 文件包含不同类型的对象,所以这个方法不起作用。
示例JSON
{"timestamp":"2020-03-26T10:16:13.248647+0000","event_type":"stats","stats":{"uptime":42,"capture":{"kernel_packets":33,"kernel_drops":0,"errors":0},"decoder":{"pkts":33,"bytes":10827,"invalid":0,"ipv4":19,"ipv6":0,"ethernet":33,"raw":0,"null":0,"sll":0,"tcp":0,"udp":19,"sctp":0,"icmpv4":0,"icmpv6":0,"ppp":0,"pppoe":0,"gre":0,"vlan":0,"vlan_qinq":0,"ieee8021ah":0,"teredo":0,"ipv4_in_ipv6":0,"ipv6_in_ipv6":0,"mpls":0,"avg_pkt_size":328,"max_pkt_size":494,"erspan":0,"ipraw":{"invalid_ip_version":0},"ltnull":{"pkt_too_small":0,"unsupported_type":0},"dce":{"pkt_too_small":0}},"flow":{"memcap":0,"tcp":0,"udp":12,"icmpv4":0,"icmpv6":0,"spare":10000,"emerg_mode_entered":0,"emerg_mode_over":0,"tcp_reuse":0,"memuse":6597184},"defrag":{"ipv4":{"fragments":0,"reassembled":0,"timeouts":0},"ipv6":{"fragments":0,"reassembled":0,"timeouts":0},"max_frag_hits":0},"tcp":{"sessions":0,"ssn_memcap_drop":0,"pseudo":0,"pseudo_failed":0,"invalid_checksum":0,"no_flow":0,"syn":0,"synack":0,"rst":0,"midstream_pickups":0,"pkt_on_wrong_thread":0,"segment_memcap_drop":0,"stream_depth_reached":0,"reassembly_gap":0,"overlap":0,"overlap_diff_data":0,"insert_data_normal_fail":0,"insert_data_overlap_fail":0,"insert_list_fail":0,"memuse":2031616,"reassembly_memuse":294912},"detect":{"engines":[{"id":0,"last_reload":"2020-03-26T10:16:03.882522+0000","rules_loaded":13659,"rules_failed":0}],"alert":0},"app_layer":{"flow":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":0,"imap":0,"msn":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ntp":0,"ftp-data":0,"tftp":0,"ikev2":0,"krb5_tcp":0,"dhcp":0,"failed_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"krb5_udp":0,"failed_udp":12},"tx":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ftp-data":0,"krb5_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"ntp":0,"tftp":0,"ikev2":0,"krb5_udp":0,"dhcp":0},"expectations":0},"flow_mgr":{"closed_pruned":0,"new_pruned":0,"est_pruned":0,"bypassed_pruned":0,"flows_checked":0,"flows_notimeo
ut":0,"flows_timeout":0,"flows_timeout_inuse":0,"flows_removed":0,"rows_checked":65536,"rows_skipped":65536,"rows_empty":0,"rows_busy":0,"rows_maxlen":0},"dns":{"memuse":0,"memcap_state":0,"memcap_global":0},"http":{"memuse":0,"memcap":0},"ftp":{"memuse":0,"memcap":0}}}
{"timestamp":"2020-03-26T10:16:14.770482+0000","flow_id":771992469731822,"in_iface":"wlan0","event_type":"ssh","src_ip":"192.168.1.222","src_port":59521,"dest_ip":"192.168.1.234","dest_port":22,"proto":"TCP","ssh":{"client":{"proto_version":"2.0","software_version":"WinSCP_release_5.15.9"},"server":{"proto_version":"2.0","software_version":"OpenSSH_7.9p1 Raspbian-10+deb10u2"}}}
{"timestamp":"2020-03-26T10:16:19.435893+0000","flow_id":2061547810825909,"in_iface":"wlan0","event_type":"alert","src_ip":"192.168.1.194","src_port":57621,"dest_ip":"192.168.1.255","dest_port":57621,"proto":"UDP","alert":{"action":"allowed","gid":1,"signature_id":2027397,"rev":1,"signature":"ET POLICY Spotify P2P Client","category":"Not Suspicious Traffic","severity":3,"metadata":{"updated_at":["2019_05_30"],"performance_impact":["Low"],"created_at":["2019_05_30"],"signature_severity":["Minor"],"deployment":["Internal"],"attack_target":["Client_Endpoint"],"affected_product":["Windows_Client_Apps"]}},"app_proto":"failed","flow":{"pkts_toserver":1,"pkts_toclient":0,"bytes_toserver":86,"bytes_toclient":0,"start":"2020-03-26T10:16:19.435893+0000"}}
{"timestamp":"2020-03-26T10:16:21.252753+0000","event_type":"stats","stats":{"uptime":50,"capture":{"kernel_packets":121,"kernel_drops":0,"errors":0},"decoder":{"pkts":121,"bytes":28365,"invalid":0,"ipv4":86,"ipv6":0,"ethernet":121,"raw":0,"null":0,"sll":0,"tcp":49,"udp":37,"sctp":0,"icmpv4":0,"icmpv6":0,"ppp":0,"pppoe":0,"gre":0,"vlan":0,"vlan_qinq":0,"ieee8021ah":0,"teredo":0,"ipv4_in_ipv6":0,"ipv6_in_ipv6":0,"mpls":0,"avg_pkt_size":234,"max_pkt_size":1158,"erspan":0,"ipraw":{"invalid_ip_version":0},"ltnull":{"pkt_too_small":0,"unsupported_type":0},"dce":{"pkt_too_small":0}},"flow":{"memcap":0,"tcp":1,"udp":22,"icmpv4":0,"icmpv6":0,"spare":9999,"emerg_mode_entered":0,"emerg_mode_over":0,"tcp_reuse":0,"memuse":6600064},"defrag":{"ipv4":{"fragments":0,"reassembled":0,"timeouts":0},"ipv6":{"fragments":0,"reassembled":0,"timeouts":0},"max_frag_hits":0},"tcp":{"sessions":1,"ssn_memcap_drop":0,"pseudo":0,"pseudo_failed":0,"invalid_checksum":0,"no_flow":0,"syn":1,"synack":1,"rst":0,"midstream_pickups":0,"pkt_on_wrong_thread":0,"segment_memcap_drop":0,"stream_depth_reached":0,"reassembly_gap":0,"overlap":5,"overlap_diff_data":0,"insert_data_normal_fail":0,"insert_data_overlap_fail":0,"insert_list_fail":0,"memuse":2031616,"reassembly_memuse":294912},"detect":{"engines":[{"id":0,"last_reload":"2020-03-26T10:16:03.882522+0000","rules_loaded":13659,"rules_failed":0}],"alert":1},"app_layer":{"flow":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":1,"imap":0,"msn":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ntp":0,"ftp-data":0,"tftp":0,"ikev2":0,"krb5_tcp":0,"dhcp":0,"failed_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"krb5_udp":0,"failed_udp":22},"tx":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ftp-data":0,"krb5_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"ntp":0,"tftp":0,"ikev2":0,"krb5_udp":0,"dhcp":0},"expectations":0},"flow_mgr":{"closed_pruned":0,"new_pruned":0,"est_pruned":0,"bypassed_pruned":0,"flows_checked":2,"flows_not
imeout":2,"flows_timeout":0,"flows_timeout_inuse":0,"flows_removed":0,"rows_checked":65536,"rows_skipped":65534,"rows_empty":0,"rows_busy":0,"rows_maxlen":1},"dns":{"memuse":0,"memcap_state":0,"memcap_global":0},"http":{"memuse":0,"memcap":0},"ftp":{"memuse":0,"memcap":0}}}
{"timestamp":"2020-03-26T10:16:24.972539+0000","flow_id":342276697544443,"in_iface":"wlan0","event_type":"alert","src_ip":"fe80:0000:0000:0000:f286:20ff:fe42:f22b","dest_ip":"ff02:0000:0000:0000:0000:0000:0000:0001","proto":"IPv6-ICMP","icmp_type":134,"icmp_code":0,"alert":{"action":"allowed","gid":1,"signature_id":0,"rev":0,"signature":"RULE TRIGGRED","category":"","severity":3},"flow":{"pkts_toserver":1,"pkts_toclient":0,"bytes_toserver":118,"bytes_toclient":0,"start":"2020-03-26T10:16:24.972539+0000"}}
{"timestamp":"2020-03-26T10:16:24.974956+0000","flow_id":573109714870380,"in_iface":"wlan0","event_type":"alert","src_ip":"fe80:0000:0000:0000:986d:1d1b:673f:b319","dest_ip":"ff02:0000:0000:0000:0000:0000:0000:0001","proto":"IPv6-ICMP","icmp_type":136,"icmp_code":0,"alert":{"action":"allowed","gid":1,"signature_id":0,"rev":0,"signature":"RULE TRIGGRED","category":"","severity":3},"flow":{"pkts_toserver":1,"pkts_toclient":0,"bytes_toserver":86,"bytes_toclient":0,"start":"2020-03-26T10:16:24.974956+0000"}}
{"timestamp":"2020-03-26T10:16:24.998168+0000","flow_id":607027071564568,"in_iface":"wlan0","event_type":"alert","src_ip":"fe80:0000:0000:0000:986d:1d1b:673f:b319","dest_ip":"ff02:0000:0000:0000:0000:0000:0000:0016","proto":"IPv6-ICMP","icmp_type":143,"icmp_code":0,"alert":{"action":"allowed","gid":1,"signature_id":0,"rev":0,"signature":"RULE TRIGGRED","category":"","severity":3},"flow":{"pkts_toserver":1,"pkts_toclient":0,"bytes_toserver":130,"bytes_toclient":0,"start":"2020-03-26T10:16:24.998168+0000"}}
{"timestamp":"2020-03-26T10:16:29.257116+0000","event_type":"stats","stats":{"uptime":58,"capture":{"kernel_packets":181,"kernel_drops":0,"errors":0},"decoder":{"pkts":181,"bytes":45698,"invalid":0,"ipv4":116,"ipv6":8,"ethernet":181,"raw":0,"null":0,"sll":0,"tcp":66,"udp":52,"sctp":0,"icmpv4":0,"icmpv6":6,"ppp":0,"pppoe":0,"gre":0,"vlan":0,"vlan_qinq":0,"ieee8021ah":0,"teredo":0,"ipv4_in_ipv6":0,"ipv6_in_ipv6":0,"mpls":0,"avg_pkt_size":252,"max_pkt_size":1514,"erspan":0,"ipraw":{"invalid_ip_version":0},"ltnull":{"pkt_too_small":0,"unsupported_type":0},"dce":{"pkt_too_small":0}},"flow":{"memcap":0,"tcp":1,"udp":31,"icmpv4":0,"icmpv6":3,"spare":10000,"emerg_mode_entered":0,"emerg_mode_over":0,"tcp_reuse":0,"memuse":6602704},"defrag":{"ipv4":{"fragments":0,"reassembled":0,"timeouts":0},"ipv6":{"fragments":0,"reassembled":0,"timeouts":0},"max_frag_hits":0},"tcp":{"sessions":1,"ssn_memcap_drop":0,"pseudo":0,"pseudo_failed":0,"invalid_checksum":0,"no_flow":0,"syn":1,"synack":1,"rst":0,"midstream_pickups":0,"pkt_on_wrong_thread":0,"segment_memcap_drop":0,"stream_depth_reached":0,"reassembly_gap":0,"overlap":5,"overlap_diff_data":0,"insert_data_normal_fail":0,"insert_data_overlap_fail":0,"insert_list_fail":0,"memuse":2031616,"reassembly_memuse":294912},"detect":{"engines":[{"id":0,"last_reload":"2020-03-26T10:16:03.882522+0000","rules_loaded":13659,"rules_failed":0}],"alert":4},"app_layer":{"flow":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":1,"imap":0,"msn":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ntp":0,"ftp-data":0,"tftp":0,"ikev2":0,"krb5_tcp":0,"dhcp":0,"failed_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"krb5_udp":0,"failed_udp":31},"tx":{"http":0,"ftp":0,"smtp":0,"tls":0,"ssh":0,"smb":0,"dcerpc_tcp":0,"dns_tcp":0,"nfs_tcp":0,"ftp-data":0,"krb5_tcp":0,"dcerpc_udp":0,"dns_udp":0,"nfs_udp":0,"ntp":0,"tftp":0,"ikev2":0,"krb5_udp":0,"dhcp":0},"expectations":0},"flow_mgr":{"closed_pruned":0,"new_pruned":0,"est_pruned":0,"bypassed_pruned":0,"flows_checked":0,"flows_n
otimeout":0,"flows_timeout":0,"flows_timeout_inuse":0,"flows_removed":0,"rows_checked":65536,"rows_skipped":65536,"rows_empty":0,"rows_busy":0,"rows_maxlen":0},"dns":{"memuse":0,"memcap_state":0,"memcap_global":0},"http":{"memuse":0,"memcap":0},"ftp":{"memuse":0,"memcap":0}}}
我建议在这里使用 defaultdict:
from collections import defaultdict

# Decode the file one line at a time; each line is a standalone JSON object.
data = []
with open('eve.json') as file:
    data.extend(map(json.loads, file))

# Group the decoded records by their "event_type" value. defaultdict(list)
# creates the list for a type the first time that type is seen.
d = defaultdict(list)
for line in data:
    d[line["event_type"]].append(line)
然后对每个列表分别调用 DataFrame 构造函数,得到一个以事件类型为键的数据帧字典:
import pandas as pd

# Build one DataFrame per event type from the grouped record lists in `d`.
# Nested JSON objects are not expanded here: they remain dict-valued cells.
dfs = {k: pd.DataFrame(v) for k, v in d.items()}
print(dfs['alert'])
timestamp flow_id in_iface event_type
0 2020-03-26T10:16:19.435893+0000 2061547810825909 wlan0 alert
1 2020-03-26T10:16:24.972539+0000 342276697544443 wlan0 alert
2 2020-03-26T10:16:24.974956+0000 573109714870380 wlan0 alert
3 2020-03-26T10:16:24.998168+0000 607027071564568 wlan0 alert
src_ip src_port
0 192.168.1.194 57621.0
1 fe80:0000:0000:0000:f286:20ff:fe42:f22b NaN
2 fe80:0000:0000:0000:986d:1d1b:673f:b319 NaN
3 fe80:0000:0000:0000:986d:1d1b:673f:b319 NaN
dest_ip dest_port proto
0 192.168.1.255 57621.0 UDP
1 ff02:0000:0000:0000:0000:0000:0000:0001 NaN IPv6-ICMP
2 ff02:0000:0000:0000:0000:0000:0000:0001 NaN IPv6-ICMP
3 ff02:0000:0000:0000:0000:0000:0000:0016 NaN IPv6-ICMP
alert app_proto
0 {'action': 'allowed', 'gid': 1, 'signature_id'... failed
1 {'action': 'allowed', 'gid': 1, 'signature_id'... NaN
2 {'action': 'allowed', 'gid': 1, 'signature_id'... NaN
3 {'action': 'allowed', 'gid': 1, 'signature_id'... NaN
flow icmp_type icmp_code
0 {'pkts_toserver': 1, 'pkts_toclient': 0, 'byte... NaN NaN
1 {'pkts_toserver': 1, 'pkts_toclient': 0, 'byte... 134.0 0.0
2 {'pkts_toserver': 1, 'pkts_toclient': 0, 'byte... 136.0 0.0
3 {'pkts_toserver': 1, 'pkts_toclient': 0, 'byte... 143.0 0.0
print (dfs['ssh'])
timestamp flow_id in_iface event_type
0 2020-03-26T10:16:14.770482+0000 771992469731822 wlan0 ssh
src_ip src_port dest_ip dest_port proto
0 192.168.1.222 59521 192.168.1.234 22 TCP
ssh
0 {'client': {'proto_version': '2.0', 'software_...
如果想要更扁平化的数据,请使用 pandas 的 json_normalize:
try:
    # Public top-level location of json_normalize since pandas 1.0.
    from pandas import json_normalize
except ImportError:
    # Older pandas releases only expose it under pandas.io.json.
    from pandas.io.json import json_normalize

# Build one flattened DataFrame per event type from the grouped records in `d`.
# Nested objects become dotted column names such as "alert.metadata.deployment".
dfs = {k: json_normalize(v) for k, v in d.items()}
print(dfs['alert'])
timestamp flow_id in_iface event_type
0 2020-03-26T10:16:19.435893+0000 2061547810825909 wlan0 alert
1 2020-03-26T10:16:24.972539+0000 342276697544443 wlan0 alert
2 2020-03-26T10:16:24.974956+0000 573109714870380 wlan0 alert
3 2020-03-26T10:16:24.998168+0000 607027071564568 wlan0 alert
src_ip src_port
0 192.168.1.194 57621.0
1 fe80:0000:0000:0000:f286:20ff:fe42:f22b NaN
2 fe80:0000:0000:0000:986d:1d1b:673f:b319 NaN
3 fe80:0000:0000:0000:986d:1d1b:673f:b319 NaN
dest_ip dest_port proto app_proto
0 192.168.1.255 57621.0 UDP failed
1 ff02:0000:0000:0000:0000:0000:0000:0001 NaN IPv6-ICMP NaN
2 ff02:0000:0000:0000:0000:0000:0000:0001 NaN IPv6-ICMP NaN
3 ff02:0000:0000:0000:0000:0000:0000:0016 NaN IPv6-ICMP NaN
alert.metadata.deployment alert.metadata.attack_target
0 ... [Internal] [Client_Endpoint]
1 ... NaN NaN
2 ... NaN NaN
3 ... NaN NaN
alert.metadata.affected_product flow.pkts_toserver flow.pkts_toclient
0 [Windows_Client_Apps] 1 0
1 NaN 1 0
2 NaN 1 0
3 NaN 1 0
flow.bytes_toserver flow.bytes_toclient flow.start
0 86 0 2020-03-26T10:16:19.435893+0000
1 118 0 2020-03-26T10:16:24.972539+0000
2 86 0 2020-03-26T10:16:24.974956+0000
3 130 0 2020-03-26T10:16:24.998168+0000
icmp_type icmp_code
0 NaN NaN
1 134.0 0.0
2 136.0 0.0
3 143.0 0.0
[4 rows x 31 columns]
print (dfs['ssh'])
timestamp flow_id in_iface event_type
0 2020-03-26T10:16:14.770482+0000 771992469731822 wlan0 ssh
src_ip src_port dest_ip dest_port proto
0 192.168.1.222 59521 192.168.1.234 22 TCP
ssh.client.proto_version ssh.client.software_version
0 2.0 WinSCP_release_5.15.9
ssh.server.proto_version ssh.server.software_version
0 2.0 OpenSSH_7.9p1 Raspbian-10+deb10u2