使用 Python 加速从大型 XML 文件中提取数据



你好,我对 Python 不太熟练,但需要从 XML 文件中提取数值。

我目前用 for 循环从 xml.dom.minidom.Document 对象中逐个读取属性值(x、y、z 和 TEMP)。由于文件中约有 50 万个值,这种做法非常耗时。我也尝试过 lxml,但出现了错误:模块 "lxml" 没有属性 "parse" 或 "Xpath"。

xml文件具有以下格式


<?xml version="1.0" encoding="utf-8"?>
<variable_output>
<!--version      : 1-->
<!--object title : Volume (1)-->
<!--scalar variable : Temperature (TEMP)-->
<POINT>
<Vertex>
<Position x="-0.176300004" y="-0.103100002" z="-0.153699994"/>
<Scalar TEMP="84.192421"/>
</Vertex>
</POINT>
<POINT>
<Vertex>
<Position x="-0.173557162" y="-0.103100002" z="-0.153699994"/>
<Scalar TEMP="83.9050522"/>
</Vertex>
</POINT>
<POINT>
<Vertex>
<Position x="-0.170814306" y="-0.103100002" z="-0.153699994"/>
<Scalar TEMP="83.7506332"/>
</Vertex>
</POINT>
</variable_output>

下面的代码在处理较大的文件时耗时很长。

from xml.dom.minidom import parse
import xml.dom.minidom
import csv
import pandas as pd
import numpy as np
import os
import glob
import time
from lxml import etree
# Accumulator for the TEMP values. It starts as an empty list; np.append
# returns an ndarray, so it is an ndarray after the first loop iteration.
v=[]
# Parse the whole document into a minidom tree and collect every <Scalar> node.
doc =parse("document.xml")
Val = doc.getElementsByTagName("Scalar")

# Time how long it takes to read the TEMP attribute of every <Scalar>.
t0 = time.time()
# NOTE(review): np.append does not append in place; it returns a new array on
# every call, so the growing array is copied once per element. With ~500k
# values this is likely the dominant cost — collecting into a Python list and
# converting once at the end would avoid it.
for s in Val:
v=np.append(v,float(s.attributes['TEMP'].value))
# Wrap v in an outer list so res is 2-D with a single row; it is transposed
# below so the CSV gets one value per line.
res=np.array([v])
t1 = time.time()
total = (t1-t0)
print('Time for Value', str(total))
# Second attempt: parse the same file with lxml.
doc2=etree.parse("document.xml")

# Time an XPath query for the TEMP attributes.
# NOTE(review): the lxml method is lowercase `xpath`; `Xpath` does not exist
# on the parsed tree and raises AttributeError.
# NOTE(review): the path starts at /POINT and names `Scaler`, but the sample
# document's root is <variable_output> and the element is <Scalar>, so this
# expression would match nothing even with the method name fixed — TODO
# confirm the intended path (e.g. "//Scalar/@TEMP").
t0 = time.time()
temp=doc2.Xpath("/POINT/Vertex/Scaler/@TEMP")
t1 = time.time()
total2 = t1-t0
print('Time for Value', str(total2))
# Write the values collected by the minidom loop (not the XPath result) to
# Data.csv as a single column, without index or header.
pd.DataFrame(res.T).to_csv(('Data.csv'),index=False,header=False)   #write timestep as csv

使用Xpath获取Temp或x、y、z值时的错误:

In [12]: temp=doc2.Xpath("/POINT/Vertex/Scaler/@TEMP")
Traceback (most recent call last):
File "<ipython-input-12-bbd832a3074e>", line 1, in <module>
temp=doc2.Xpath("/POINT/Vertex/Scaler/@TEMP")
AttributeError: 'lxml.etree._ElementTree' object has no attribute 'Xpath'

对于大型 XML 文件,我建议使用 iterparse():它可以边解析边处理元素,而不必先把整棵文档树载入内存。

import timeit
import os, psutil
import datetime
import pandas as pd
import xml.etree.ElementTree as ET
class parse_xml:
    """Stream an XML point file into a pandas DataFrame.

    The file is read incrementally with ``ElementTree.iterparse``. Every
    ``<Scalar TEMP=...>`` element produces one row, combined with the
    ``x``/``y``/``z`` attributes of the ``<Position>`` element seen before it
    in the same ``<Vertex>``. Attribute values are stored exactly as the
    strings found in the file; no numeric conversion is performed.

    Args:
        path: Path to the XML file to parse.

    Raises:
        OSError: If ``path`` cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """

    def __init__(self, path):
        # Base name of the input file; printed as a progress message.
        self.xml = os.path.split(path)[1]
        print(self.xml)

        columns = ["Pos_x", "Pos_y", "Pos_z", "Scalar_Temp"]

        data = []
        # Start with an empty position so that a <Scalar> appearing before any
        # <Position> yields a row with missing coordinates instead of a
        # NameError.
        x = y = z = None
        # Parse from the full path: using only the base name would silently
        # require the file to live in the current working directory.
        for _event, elem in ET.iterparse(path, events=("end",)):
            tag = elem.tag
            if tag == "Position":
                x, y, z = elem.get("x"), elem.get("y"), elem.get("z")
            elif tag == "Scalar":
                data.append((x, y, z, elem.get("TEMP")))
            elif tag == "Vertex":
                # Do not let one vertex's position leak into the next vertex.
                x = y = z = None
            # Drop the element's attributes and children once it has been
            # handled, to keep memory usage low on large files.
            elem.clear()

        df = pd.DataFrame(data, columns=columns)
        print(df)
        # Keep the result so callers can use it after construction.
        self.df = df

def main():
    """Parse the sample document and print its contents as a DataFrame."""
    # Raw string so the Windows backslashes are taken literally. The path
    # separators are required: without them the whole string is treated as a
    # single file name.
    xml_file = r"D:\Daten\Programmieren\stackoverflow\document.xml"
    parse_xml(xml_file)
if __name__ == "__main__":
    # Capture the start timestamp and a high-resolution timer before running.
    now = datetime.datetime.now()
    starttime = timeit.default_timer()
    main()
    # Handle for the current process, used to report its resident memory.
    process = psutil.Process(os.getpid())
    # Leading newline separates the summary from the DataFrame printed above.
    print('\nFinished')
    print(f"{now:%Y-%m-%d %H:%M}")
    print('Runtime:', timeit.default_timer()-starttime)
    # rss is reported in bytes; divide by 1000**2 to show megabytes.
    print(f'RAM: {process.memory_info().rss/1000**2} MB')

输出:

document.xml
Pos_x         Pos_y         Pos_z Scalar_Temp
0  -0.176300004  -0.103100002  -0.153699994   84.192421
1  -0.173557162  -0.103100002  -0.153699994  83.9050522
2  -0.170814306  -0.103100002  -0.153699994  83.7506332
Finished
2022-11-29 23:51
Runtime: 0.007375300000000029
RAM: 55.619584 MB

如果输出太大,可以用 df.to_sql() 将其写入 sqlite3 数据库。

最新更新