使用具有多个标记的Pandas将XML转换为CSV



需要使用python(Pandas(或任何其他库将XML文件展平为CSV。我在下面有多个子标签,还有一个要跨子元素复制的头。这是我用Python编写的第一个程序,但无法成功执行。感谢这里的任何方向

-输入数据的预期输出

chName,envText,rptCod,rptNam,membId,membLglNam,rptPrntEffDat,rptPrntRunDat,membClgIdCod,membClgIdNam,clgCurrTypCod,poolId,currTypCod,membChgIdCod,membChgIdNam,acctTypFlexible,prtMgnUnadj,prtMgnReqt,ChgRat,totMgnClgCurr,sumChgMbrPrtgbReq,sumChgMbrClgCurr,sumPoolIdTotal,sumPoolIdClgCurr,sumClgMbrTotMgnClgCurr
DEBSIL,S,PTR001,DD01 Report,POP,ABC LT,08/04/2021,08/04/2021,UVBV,YEZZE,USD,POPXCLIENTECM,USD,POP123,ABC LT123,A1,5656.6,5634.6,1,34543.6,5656.6,5634.6,5656.6,5634.6,700090
DEBSIL,S,PTR001,DD01 Report,POP,ABC LT,08/04/2021,08/04/2021,UVBV,YEZZE,USD,POPXCLIENTNET,NZD,POP676,SEEN TD,A9,8989.9,45656.9,1,1780688.9,8989.9,45656.9,8989.9,45656.9,700090
DEBSIL,S,PTR001,DD01 Report,POP,ABC LT,08/04/2021,08/04/2021,UVBV,YEZZE,USD,POPXSTANDARD,MXM,POP343,ACMI,P1,345.5,4545.5,1,4545.5,345.5,4545.5,345.5,4545.5,700090

输入数据

<PTR001>
<rptHdr>
<chName>DEBSIL</chName>
<envText>S</envText>
<rptCod>PTR001</rptCod>
<rptNam>DD01 Report</rptNam>
<membId>POP</membId>
<membLglNam>ABC LT</membLglNam>
<rptPrntEffDat>2021-04-08</rptPrntEffDat>
<rptPrntRunDat>2021-04-08</rptPrntRunDat>
</rptHdr>
<ptr001Grp>
<ptr001KeyGrp>
<membClgIdCod>UVBV</membClgIdCod>
<membClgIdNam>YEZZE</membClgIdNam>
</ptr001KeyGrp>
<clgCurrTypCod>USD</clgCurrTypCod>
<ptr001Grp1>
<ptr001KeyGrp1>
<poolId>CSBXCLIENTECM</poolId>
<currTypCod>USD</currTypCod>
</ptr001KeyGrp1>
<ptr001Grp3>
<ptr001KeyGrp3>
<membChgIdCod>POP123</membChgIdCod>
<membChgIdNam>ABC LT123</membChgIdNam>
</ptr001KeyGrp3>
<ptr001Rec>
<acctTypFlexible>A1</acctTypFlexible>
<prtMgnUnadj>5656.60</prtMgnUnadj>
<prtMgnReqt>5634.60</prtMgnReqt>
<ChgRat>1.000000</ChgRat>
<totMgnClgCurr>34543.60</totMgnClgCurr>
</ptr001Rec>
<sumChgMbrPrtMgbReq>5656.60</sumChgMbrPrtMgbReq>
<sumChgMbrClgCurr>5634.60</sumChgMbrClgCurr>
</ptr001Grp3>
<sumPoolIdTotal>5656.60</sumPoolIdTotal>
<sumPoolIdClgCurr>5634.60</sumPoolIdClgCurr>
</ptr001Grp1>
<ptr001Grp1>
<ptr001KeyGrp1>
<poolId>POPXCLIENTNET</poolId>
<currTypCod>NZD</currTypCod>
</ptr001KeyGrp1>
<ptr001Grp3>
<ptr001KeyGrp3>
<membChgIdCod>POP676</membChgIdCod>
<membChgIdNam>SEEN TD</membChgIdNam>
</ptr001KeyGrp3>
<ptr001Rec>
<acctTypFlexible>A9</acctTypFlexible>
<prtMgnUnadj>8989.90</prtMgnUnadj>
<prtMgnReqt>45656.90</prtMgnReqt>
<ChgRat>1.000000</ChgRat>
<totMgnClgCurr>1780688.90</totMgnClgCurr>
</ptr001Rec>
<sumChgMbrPrtMgbReq>8989.90</sumChgMbrPrtMgbReq>
<sumChgMbrClgCurr>45656.90</sumChgMbrClgCurr>
</ptr001Grp3>
<sumPoolIdTotal>8989.90</sumPoolIdTotal>
<sumPoolIdClgCurr>45656.90</sumPoolIdClgCurr>
</ptr001Grp1>
<ptr001Grp1>
<ptr001KeyGrp1>
<poolId>POPXSTANDARD</poolId>
<currTypCod>MXM</currTypCod>
</ptr001KeyGrp1>
<ptr001Grp3>
<ptr001KeyGrp3>
<membChgIdCod>POP343</membChgIdCod>
<membChgIdNam>ACMI</membChgIdNam>
</ptr001KeyGrp3>
<ptr001Rec>
<acctTypFlexible>P1</acctTypFlexible>
<prtMgnUnadj>345.50</prtMgnUnadj>
<prtMgnReqt>4545.50</prtMgnReqt>
<ChgRat>1.000000</ChgRat>
<totMgnClgCurr>4545.50</totMgnClgCurr>
</ptr001Rec>
<sumChgMbrPrtMgbReq>345.50</sumChgMbrPrtMgbReq>
<sumChgMbrClgCurr>4545.50</sumChgMbrClgCurr>
</ptr001Grp3>
<sumPoolIdTotal>345.50</sumPoolIdTotal>
<sumPoolIdClgCurr>4545.50</sumPoolIdClgCurr>
</ptr001Grp1>
<sumClgMbrTotMgnClgCurr>700090.00</sumClgMbrTotMgnClgCurr>
</ptr001Grp>
</PTR001>
# Importing the required libraries
import xml.etree.ElementTree as Xet
import pandas as pd

cols = ["poolId", "currTypCod", "membChgIdCod", "membChgIdNam", "acctTypFlexible", "prtMgnReqt", "prtMgnReqt", "ChgRat", "totMgnClgCurr", "sumChgMbrPrtgbReq", "sumChgMbrClgCurr", "sumPoolIdTotal", "sumPoolIdClgCurr"]
rows = []

# Parsing the XML file
xmlparse = Xet.parse('myReport.xml')
root = xmlparse.getroot()
for i in root:
poolId = i.find("poolId").text
currTypCod = i.find("currTypCod").text
membChgIdCod = i.find("membChgIdCod").text
membChgIdNam = i.find("membChgIdNam").text
acctTypFlexible = i.find("acctTypFlexible").text
prtMgnReqt = i.find("prtMgnReqt").text
ChgRat = i.find("ChgRat").text
totMgnClgCurr = i.find("totMgnClgCurr").text
sumChgMbrPrtgbReq = i.find("sumChgMbrPrtgbReq").text
sumChgMbrClgCurr = i.find("sumChgMbrClgCurr").text
sumPoolIdTotal = i.find("sumPoolIdTotal").text
sumPoolIdClgCurr = i.find("sumPoolIdClgCurr").text

rows.append({"poolId": poolId,
"currTypCod": currTypCod,
"membChgIdCod": membChgIdCod,
"membChgIdNam": membChgIdNam,
"acctTypFlexible": acctTypFlexible,
"prtMgnReqt": prtMgnReqt,
"ChgRat": ChgRat,
"totMgnClgCurr": totMgnClgCurr,
"sumChgMbrPrtgbReq":  sumChgMbrPrtgbReq,
"sumChgMbrClgCurr": sumChgMbrClgCurr,
"sumPoolIdTotal": sumPoolIdTotal,
"sumPoolIdClgCurr": sumPoolIdClgCurr})

df = pd.DataFrame(rows, columns=cols)

# Writing dataframe to csv
df.to_csv('myReport.csv')

我将把CSV列的排序留给您来解决。在SO中搜索应该会产生许多帖子。

from xml.etree import ElementTree as ET
import pandas as pd
data = '''
<PTR001>
<rptHdr>
<chName>DEBSIL</chName>
<envText>S</envText>
<rptCod>PTR001</rptCod>
<rptNam>DD01 Report</rptNam>
<membId>POP</membId>
<membLglNam>ABC LT</membLglNam>
<rptPrntEffDat>2021-04-08</rptPrntEffDat>
<rptPrntRunDat>2021-04-08</rptPrntRunDat>
</rptHdr>
<ptr001Grp>
<ptr001KeyGrp>
<membClgIdCod>UVBV</membClgIdCod>
<membClgIdNam>YEZZE</membClgIdNam>
</ptr001KeyGrp>
<clgCurrTypCod>USD</clgCurrTypCod>
<ptr001Grp1>
<ptr001KeyGrp1>
<poolId>CSBXCLIENTECM</poolId>
<currTypCod>USD</currTypCod>
</ptr001KeyGrp1>
<ptr001Grp3>
<ptr001KeyGrp3>
<membChgIdCod>POP123</membChgIdCod>
<membChgIdNam>ABC LT123</membChgIdNam>
</ptr001KeyGrp3>
<ptr001Rec>
<acctTypFlexible>A1</acctTypFlexible>
<prtMgnUnadj>5656.60</prtMgnUnadj>
<prtMgnReqt>5634.60</prtMgnReqt>
<ChgRat>1.000000</ChgRat>
<totMgnClgCurr>34543.60</totMgnClgCurr>
</ptr001Rec>
<sumChgMbrPrtMgbReq>5656.60</sumChgMbrPrtMgbReq>
<sumChgMbrClgCurr>5634.60</sumChgMbrClgCurr>
</ptr001Grp3>
<sumPoolIdTotal>5656.60</sumPoolIdTotal>
<sumPoolIdClgCurr>5634.60</sumPoolIdClgCurr>
</ptr001Grp1>
<ptr001Grp1>
<ptr001KeyGrp1>
<poolId>POPXCLIENTNET</poolId>
<currTypCod>NZD</currTypCod>
</ptr001KeyGrp1>
<ptr001Grp3>
<ptr001KeyGrp3>
<membChgIdCod>POP676</membChgIdCod>
<membChgIdNam>SEEN TD</membChgIdNam>
</ptr001KeyGrp3>
<ptr001Rec>
<acctTypFlexible>A9</acctTypFlexible>
<prtMgnUnadj>8989.90</prtMgnUnadj>
<prtMgnReqt>45656.90</prtMgnReqt>
<ChgRat>1.000000</ChgRat>
<totMgnClgCurr>1780688.90</totMgnClgCurr>
</ptr001Rec>
<sumChgMbrPrtMgbReq>8989.90</sumChgMbrPrtMgbReq>
<sumChgMbrClgCurr>45656.90</sumChgMbrClgCurr>
</ptr001Grp3>
<sumPoolIdTotal>8989.90</sumPoolIdTotal>
<sumPoolIdClgCurr>45656.90</sumPoolIdClgCurr>
</ptr001Grp1>
<ptr001Grp1>
<ptr001KeyGrp1>
<poolId>POPXSTANDARD</poolId>
<currTypCod>MXM</currTypCod>
</ptr001KeyGrp1>
<ptr001Grp3>
<ptr001KeyGrp3>
<membChgIdCod>POP343</membChgIdCod>
<membChgIdNam>ACMI</membChgIdNam>
</ptr001KeyGrp3>
<ptr001Rec>
<acctTypFlexible>P1</acctTypFlexible>
<prtMgnUnadj>345.50</prtMgnUnadj>
<prtMgnReqt>4545.50</prtMgnReqt>
<ChgRat>1.000000</ChgRat>
<totMgnClgCurr>4545.50</totMgnClgCurr>
</ptr001Rec>
<sumChgMbrPrtMgbReq>345.50</sumChgMbrPrtMgbReq>
<sumChgMbrClgCurr>4545.50</sumChgMbrClgCurr>
</ptr001Grp3>
<sumPoolIdTotal>345.50</sumPoolIdTotal>
<sumPoolIdClgCurr>4545.50</sumPoolIdClgCurr>
</ptr001Grp1>
<sumClgMbrTotMgnClgCurr>700090.00</sumClgMbrTotMgnClgCurr>
</ptr001Grp>
</PTR001>
'''
tree = ET.fromstring(data)
header = tree.find('rptHdr')
hd = {}
for c in header:
hd[c.tag] = c.text.strip() if c.text else ''
group = tree.find('ptr001Grp')
gpaths = ['ptr001KeyGrp/*', 'clgCurrTypCod', 'sumClgMbrTotMgnClgCurr']
gd = {}
for x in gpaths:
for c in group.findall(x):
gd[c.tag] = c.text.strip() if c.text else ''
rows = []
dpaths = [
'ptr001KeyGrp1/poolId', 'ptr001KeyGrp1/currTypCod',
'ptr001Grp3/ptr001KeyGrp3/*', 'ptr001Grp3/ptr001Rec/*',
'ptr001Grp3/sumChgMbrPrtMgbReq', 'ptr001Grp3/sumChgMbrClgCurr'
]
for e in group.findall('ptr001Grp1'):
d = {}
for x in dpaths:
for c in e.findall(x):
d[c.tag] = c.text.strip() if c.text else ''
d.update(gd)
d.update(hd)
rows.append(d)
df = pd.DataFrame(rows)
print(df[['poolId', 'currTypCod', 'sumChgMbrPrtMgbReq', 'sumChgMbrClgCurr']])
import sys
df.to_csv(sys.stdout, index=False)

相关内容

  • 没有找到相关文章

最新更新