将大型XML文件转换为DataFrame


def intr_docs(xml_doc):
    """Yield one flat record (dict) per grandchild element of *xml_doc*.

    The direct children of ``xml_doc`` are treated as time steps and the
    children of each time step as vehicles.  Every record starts as a copy
    of ``xml_doc``'s own attributes and then receives the ``time`` of the
    enclosing time step plus the vehicle's ``id``, ``speed`` and ``lane``.

    Args:
        xml_doc: An ElementTree-style element (iterable over its children,
            with an ``attrib`` mapping).

    Yields:
        dict: One record per vehicle, in document order.

    Raises:
        KeyError: If a time step that has children lacks ``time``, or a
            vehicle lacks ``id``, ``speed`` or ``lane``.
    """
    base = xml_doc.attrib
    # Walk the element that was passed in (not a module-level ``root``), and
    # emit each vehicle exactly once: an extra inner pass over every
    # <timestep> would only repeat the same row once per time step.
    for timestep in xml_doc:
        for vehicle in timestep:
            record = base.copy()
            record['time'] = timestep.attrib['time']
            record['id'] = vehicle.attrib['id']
            record['speed'] = vehicle.attrib['speed']
            record['lane'] = vehicle.attrib['lane']
            yield record

# Drain the record generator into a list and build the frame from those rows.
doc_df = pd.DataFrame([*intr_docs(root)])

<timestep time="28800.00"/>
<timestep time="28890.00">
<vehicle id="800002" x="5534.41" y="3530.10" angle="243.59" type="HV" speed="14.58" pos="5.10" lane="-gneE13_2" slope="0.00"/>
<vehicle id="800003" x="4190.43" y="3359.53" angle="88.95" type="HV" speed="12.64" pos="5.10" lane="-5088609#2_0" slope="0.00"/>
<vehicle id="800017" x="1972.35" y="437.35" angle="306.80" type="HV" speed="15.17" pos="5.10" lane="-gneE5_2" slope="0.00"/>
<vehicle id="800021" x="9.34" y="-1.68" angle="42.62" type="HV" speed="13.32" pos="5.10" lane="-gneE6_1" slope="0.00"/>
<vehicle id="800034" x="2616.22" y="1599.61" angle="318.00" type="HV" speed="14.54" pos="5.10" lane="-gneE2_0" slope="0.00"/>
<vehicle id="800053" x="2915.19" y="2618.33" angle="290.01" type="HV" speed="16.52" pos="5.10" lane="-6200994#2_0" slope="0.00"/>
</timestep>

我是 Python 新手。我有一个 XML 文件(见附图 1),希望将其转换为数据框(见附图 2)。我用上面的代码成功转换了一个较小的文件(13 MB),但处理大文件(超过 500 MB)时耗时很长。我想知道这是代码的问题,还是我的电脑性能不足?我该怎么办?

pandas 自带的 read_xml() 方法可能会更快:

import pandas as pd
# Select every <vehicle> element as a row and take only its XML attributes
# as the row's fields.
df = pd.read_xml(
    xml,
    attrs_only=True,
    xpath="//vehicle",
)

使用xpath参数告诉pandas使用每个vehicle元素作为数据框行,attrs_only参数告诉pandas使用属性作为行字段。这种方法将采用vehicle元素的所有属性,不必要的字段可以在数据框创建后删除。

使用SAX解析器的解决方案:
import xml.sax
from collections import defaultdict
import pandas as pd
# class handling SAX events
class TimestepHandler(xml.sax.ContentHandler):
    """SAX handler that gathers <vehicle> attributes into per-column lists.

    After parsing, ``data`` maps each column name to a list of string
    values, one entry per <vehicle> element seen, ready to be passed to
    ``pandas.DataFrame``.  The ``time`` column holds the ``time`` attribute
    of the most recently opened <timestep> element.
    """

    # Attributes copied from every <vehicle> element, in column order.
    VEHICLE_FIELDS = (
        'id', 'x', 'y', 'angle', 'type', 'speed', 'pos', 'lane', 'slope',
    )

    def __init__(self):
        super().__init__()
        # Per-instance storage; a class-level default would be shared by
        # every handler created.
        self.data = defaultdict(list)
        self.time = None

    def startElement(self, tag, attributes):
        """Handle an opening tag.

        Args:
            tag: Name of the element being opened.
            attributes: Mapping of the element's attribute names to values.

        Raises:
            KeyError: If a <timestep> lacks ``time`` or a <vehicle> lacks
                one of ``VEHICLE_FIELDS``.
        """
        if tag == 'timestep':
            # Remember the time so the following vehicles can reuse it.
            self.time = attributes['time']
        elif tag == 'vehicle':
            # Read every value before touching the columns so that a missing
            # attribute cannot leave the columns with unequal lengths.
            values = [attributes[field] for field in self.VEHICLE_FIELDS]
            self.data['time'].append(self.time)
            for field, value in zip(self.VEHICLE_FIELDS, values):
                self.data[field].append(value)
# Build the handler first, then attach it to a fresh SAX parser.
handler = TimestepHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Feed the XML file through the parser; the handler fills its columns as
# elements are encountered.
parser.parse("file.xml")

# Turn the collected columns into a DataFrame and show it.
df = pd.DataFrame(handler.data)
df

使用lxml:

解决方案
import pandas as pd
from lxml import etree
from collections import defaultdict
# Column name -> list of values, one entry per <vehicle> element.
d = defaultdict(list)
doc = etree.parse('file.xml')

# Select every <vehicle> element in the parsed document via XPath.
vehicles = doc.xpath('.//vehicle')

for v in vehicles:
    d['id'].append(v.get('id'))
    # The time value is an attribute of the vehicle's parent element.
    d['time'].append(v.xpath('../@time')[0])
    # The remaining columns are plain attribute lookups on the vehicle.
    for name in ('x', 'y', 'angle', 'type', 'speed', 'pos', 'lane', 'slope'):
        d[name].append(v.get(name))

# Assemble the DataFrame from the collected columns and show it.
df = pd.DataFrame(d)
df

最新更新