XML和可能的LXML问题



我有很多这样的XML文件

<?xml version="1.0" encoding="utf-8" standalone="no"?>
<reiXmlPrenos>
<Qfl>1808</Qfl>
<fOVE>13.7</fOVE>
<NetoVolumen>613</NetoVolumen>
<Hv>104.2</Hv>
<energenti>
<energent>
<sifra>energy_e</sifra>
<naziv>EE [kWh]</naziv>
<vrednost>238981</vrednost>
</energent>
<energent>
<sifra>energy_to</sifra>
<naziv>Do</naziv>
<vrednost>16359</vrednost>
</energent>
<energent>
<sifra>energy_en</sifra>
<naziv>En</naziv>
<vrednost>0</vrednost>
</energent>
</energenti>
<rei>
<zavetrovanost>2</zavetrovanost>
<cone>
<cona>
<cona_id>1</cona_id>
<cc_si_cona>1110000</cc_si_cona>
<visina_cone>2.7</visina_cone>
<dolzina_cone>14</dolzina_cone>
</cona>
<cona>
<cona_id>2</cona_id>
<cc_si_cona>120000</cc_si_cona>
</cona>
</cone>
</rei>
</reiXmlPrenos>

我想从这些XML文件中提取某些值。于是在这里网友的帮助下,我拼出了下面这段代码,按理说应该可以工作:

import pandas as pd
import glob
import os
from lxml import etree

os.chdir(r'R:...XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path

#note: For simplicity, I'm using the well formed version of the xml strings in your question; you'll have to use actual file names and paths
energies = ["xml", "energy_ge", "energy_en", "energy_dteu", "energy_dtlb"]
#I just made up some names - you'll have to use actual names, of course; the first one is for the file identifier - see below
rows = []
for xml in xmls:
    row = []
    id = "xml-"+str(xmls.index(xml)+1)
    #this creates the file identifier
    row.append(id)
    # NOTE(review): `xml` is a file name returned by glob, not XML text, so
    # this call tries to parse the name itself. It is left unchanged on
    # purpose: this is the call that produces the error reported below.
    root = etree.XML(xml.encode())
    #in real life, you'll have to use the parse() method

    for energy in energies[1:]:
        #the '[1:]' is used to skip the first "energy"; it's only used as the file identifier
        target = root.xpath(f'//energent[./sifra[.="{energy}"]]/vrednost/text()')
        #note the use of f-strings
        row.extend( target if len(target)>0 else "0" )
    rows.append(row)
print(pd.DataFrame(rows,columns=energies))

但是最后我得到了如下错误:

File "<string>", line 1
XMLSyntaxError: Start tag expected, '<' not found, line 1, column 1

这是XML问题吗?或者可能是lxml问题?有人知道怎么处理这个吗?

理想情况下,结果看起来像这样

xml       energy_e   energy_en   energy_to
xml-1    238981      0         16359 
xml-2    ...         ..        .. 

由于您最终想要得到的是一个 DataFrame,您可以直接使用 pandas 的 read_xml:

# Load the <energent> elements of the file named by `xml` into a frame.
energent_table = pd.read_xml(xml, xpath=".//energent")
# Drop the "naziv" column and index the remaining values by their "sifra" code.
values_by_code = energent_table.drop("naziv", axis=1).set_index("sifra")
# Transpose so every code becomes a column, then clear the leftover axis name.
df = values_by_code.T.rename_axis(None, axis=1)

你可以这样把它合并到你的代码中:

# Build one frame per XML file, then stack them into a single table.
# sorted() keeps the running number stable: glob returns names in arbitrary order.
xmls = sorted(glob.glob("*.xml"))
list_dfs = []
for idx, xml in enumerate(xmls, start=1):
    tmp_df = (
        pd.read_xml(xml, xpath=".//energent")
        .drop("naziv", axis=1)
        .set_index("sifra").T
        .rename_axis(None, axis=1)
    )
    # The first column identifies the source file as "<file name>-<running number>".
    tmp_df.insert(0, "xml", f"{xml}-{idx}")
    list_dfs.append(tmp_df)

# pd.concat() raises ValueError on an empty list, so fall back to an empty
# frame when no XML file was found.
if list_dfs:
    df = pd.concat(list_dfs, ignore_index=True)
else:
    df = pd.DataFrame(columns=["xml"])

测试/输出(x3相同的xml):

print(df) 
xml  energy_e  energy_to  energy_en
0   first.xml-1    238981      16359          0
1  second.xml-2    238981      16359          0
2   third.xml-3    238981      16359          0
import pandas as pd
import glob
import os
from lxml import etree

os.chdir(r'R:...XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path
# The first entry names the file-identifier column; the others are the
# <sifra> codes whose <vrednost> is looked up in every file.
energies = ["xml", "energy_e", "energy_en", "energy_to"]
rows = []
# enumerate() supplies the running number directly instead of searching the
# list with xmls.index() on every iteration.
for number, xml in enumerate(xmls, start=1):
    file_id = f"xml-{number}"
    row = [file_id]
    # etree.XML() parses document content, not a file name, so read the file first.
    with open(xml, 'r', encoding='utf-8') as f:
        xml_string = f.read()
    # Encoded to bytes before parsing.
    root = etree.XML(xml_string.encode())

    for energy in energies[1:]:
        target = root.xpath(f'//energent[./sifra="{energy}"]/vrednost/text()')
        # Use "0" as a placeholder when the file has no entry for this code.
        row.extend(target if target else ["0"])
    rows.append(row)
print(pd.DataFrame(rows, columns=energies))

按如下方式解析(用 etree.parse() 直接读取文件):

import pandas as pd
import glob
import os
from lxml import etree

os.chdir(r'R:...XML-1122_test')
dir_path = glob.glob('*.xml')
xmls = dir_path

# The first entry names the file-identifier column; the others are the
# <sifra> codes whose <vrednost> is looked up in every file.
energies = ["xml", "energy_ge", "energy_en", "energy_dteu", "energy_dtlb"]
rows = []
# enumerate() supplies the running number directly instead of searching the
# list with xmls.index() on every iteration.
for number, xml in enumerate(xmls, start=1):
    file_id = f"xml-{number}"
    row = [file_id]
    # etree.parse() takes the file name and reads the document from disk.
    root = etree.parse(xml)

    for energy in energies[1:]:
        target = root.xpath(f'//energent[./sifra[.="{energy}"]]/vrednost/text()')
        # Use "0" as a placeholder when the file has no entry for this code.
        row.extend(target if target else ["0"])
    rows.append(row)
print(pd.DataFrame(rows, columns=energies))

最新更新