如何使用xml.sax python解析具有不同元素的xml文件?



我希望使用python解析大型xml文件。该文件包含几个不同的元素。以下是我正在处理的文件示例:

<movie title="Searching">
<type>War, Thriller</type>
<format>DVD</format>
<year>2018</year>
<rating>PG</rating>
<stars>10</stars>
<description>Missing girl</description>
</movie>
<song title="Need You Now">
<type>Pop, Country</type>
<format>MP4</format>
<year>2010</year>
<singer>Lady Antebellum</singer>
<stars>8</stars>
</song>
<movie title="Sidewalk of New York">
<type>Romantic comedy</type>
<format>DVD</format>
<rating>PG</rating>
<stars>7</stars>
<description>Stories of people's perspective of love in NY</description>
</movie>

从上面的文件中,我想解析与电影(movie)和歌曲(song)元素相关的所有信息,并将其存储到列表中。我正在使用 xml.sax 库,但在如何区分元素方面遇到了问题。例如,当我解析 type、format 和 year 标签时,无法区分它们属于电影还是歌曲。这是我到目前为止使用的代码片段:

import psycopg2
import xml.sax
from xml.sax.handler import ContentHandler
class MovieHandler(xml.sax.ContentHandler):
    """Collect titles, types, formats and years from <movie>/<song> entries.

    Results accumulate in ``self.data``, a dict of lists:

    * ``'title'``      -- ``title`` attribute of every ``<movie>``
    * ``'song_title'`` -- ``title`` attribute of every ``<song>``
    * ``'type'``, ``'format'``, ``'year'`` -- text of those child elements
      in document order, for movies and songs alike (years as ``int``).

    The three text columns are shared between movies and songs, so a value
    cannot be traced back to the entry it came from.
    """

    # Child elements whose text content is collected.
    _TEXT_TAGS = ("type", "format", "year")

    def __init__(self):
        super().__init__()
        self.CurrentData = ""  # name of the element currently open
        self.type = ""         # last completed <type> text
        self.format = ""       # last completed <format> text
        self.year = ""         # last completed <year> text (unparsed)
        self.data = {}         # column name -> list of values
        self.list = []         # movie titles
        self.list2 = []        # song titles
        self.list3 = []        # types
        self.list4 = []        # formats
        self.list5 = []        # years
        self._chunks = []      # text pieces of the element being read

    def startElement(self, tag, attributes):
        """Record the opened tag; store the title of a movie or song.

        Raises:
            KeyError: if a ``<movie>`` or ``<song>`` has no ``title``
                attribute.
        """
        self.CurrentData = tag
        # Start from a clean buffer so text of an earlier element can never
        # leak into this one.
        self._chunks = []
        if tag == "movie":
            self.list.append(attributes["title"])
            self.data['title'] = self.list
        elif tag == "song":
            self.list2.append(attributes["title"])
            self.data['song_title'] = self.list2

    def endElement(self, tag):
        """Store the buffered text of a closing type/format/year element.

        Raises:
            ValueError: if a non-blank ``<year>`` is not an integer.
        """
        if tag == self.CurrentData and tag in self._TEXT_TAGS:
            text = "".join(self._chunks)
            if tag == "type":
                self.type = text
                self.list3.append(text)
                self.data['type'] = self.list3
            elif tag == "format":
                self.format = text
                self.list4.append(text)
                self.data['format'] = self.list4
            else:
                self.year = text
                # A blank <year/> is stored as None instead of failing on
                # int("") or silently reusing the previous year.
                self.list5.append(int(text) if text.strip() else None)
                self.data['year'] = self.list5
        self.CurrentData = ""
        self._chunks = []

    def characters(self, content):
        """Buffer character data of the currently open text element.

        The parser may deliver one text node in several calls, so pieces
        are accumulated here and joined in ``endElement``.
        """
        if self.CurrentData in self._TEXT_TAGS:
            self._chunks.append(content)

上述代码的结果是类型、格式和年份将被重复计算。下面是输出的示例:

{'format': ['DVD', 'MP4', 'DVD'],
'title': ['Searching', 'Need You Now', 'Sidewalk of New York'],
'type': ['War, Thriller',
'Pop, Country',
'Romantic Comedy'],
'year': [2018, 2010]}

关于如何解决这个问题的任何想法?谢谢!

您可以在 startElement() 中保存当前条目的类型(movie 或 song),然后在 endElement() 中根据该类型存储数据:

# Sketch only: the `...` lines stand for code that is left out here.
# startElement records which kind of entry ("movie" or "song") was opened.
def startElement(self, tag, attributes):
if tag == "movie":
self.currentEntryType = "movie"
elif tag == "song":
self.currentEntryType = "song"
...
# endElement branches on the entry kind recorded above to decide where the
# data is stored.
def endElement(self, tag):
if self.currentEntryType == "movie":
... # store as movie data

(我不太确定你的输出列表应该是什么样子,也许你可以针对示例 xml 发布你期望的输出列表。)

编辑: 有 2 个词典会更好吗,一个用于电影,一个用于歌曲?这取决于数据库的外观。

我不确定你为什么要列表,使用字典列表而不是列表字典不是更好吗?

例如使用 2 个字典,如下所示:

{   
'format': ['DVD', 'DVD'],
'title': ['Searching', 'Sidewalk of New York'],
'type': ['War, Thriller', 'Romantic Comedy'],
'year': [2018]
}
{   
'format': ['MP4'],
'title': ['Need You Now'],
'type': ['Pop, Country'],
'year': [2010]
}

问题是,您无法判断 2018 年属于《Searching》还是《Sidewalk of New York》。当您将列表提交到数据库时,这会是一个问题吗?

这是一个包含两个词典的解决方案,一个用于电影,一个用于歌曲。 基本上,它使用 8 个列表而不是 5 个列表来单独存储所有内容。

class MovieHandler(xml.sax.ContentHandler):
    """Collect movie and song data into two separate dicts of lists.

    ``self.dataMovies`` uses the keys ``'title'``, ``'type'``, ``'format'``
    and ``'year'``; ``self.dataSongs`` uses ``'song_title'``, ``'type'``,
    ``'format'`` and ``'year'``.  A key appears once the first value for it
    has been seen.  Whenever an entry closes, its type/format/year columns
    are padded with ``None`` up to the number of titles, so index ``i`` of
    every column refers to the same entry.
    """

    # Child elements whose text content is collected.
    _FIELDS = ("type", "format", "year")

    def __init__(self):
        super().__init__()
        self.CurrentData = ""   # name of the element currently open
        self.type = ""          # last completed <type> text
        self.format = ""        # last completed <format> text
        self.year = ""          # last completed <year> text (unparsed)
        self.dataMovies = {}    # column name -> list of movie values
        self.dataSongs = {}     # column name -> list of song values
        self.list = []          # movie title
        self.list2 = []         # song title
        self.list3 = []         # movie type
        self.list4 = []         # movie format
        self.list5 = []         # movie year
        self.list6 = []         # song type
        self.list7 = []         # song format
        self.list8 = []         # song year
        self.currentEntryType = None  # "movie", "song" or None
        self._text = []         # text pieces of the element being read
        # Entry tag -> (result dict, title key, title list, field columns).
        self._entries = {
            "movie": (
                self.dataMovies,
                'title',
                self.list,
                {"type": self.list3, "format": self.list4, "year": self.list5},
            ),
            "song": (
                self.dataSongs,
                'song_title',
                self.list2,
                {"type": self.list6, "format": self.list7, "year": self.list8},
            ),
        }

    def startElement(self, tag, attributes):
        """Record the opened tag; on <movie>/<song>, store its title.

        Raises:
            KeyError: if a ``<movie>`` or ``<song>`` has no ``title``
                attribute.
        """
        self.CurrentData = tag
        # Start from a clean buffer so text of an earlier element can never
        # leak into this one.
        self._text = []
        entry = self._entries.get(tag)
        if entry is not None:
            data, title_key, titles, _ = entry
            titles.append(attributes["title"])
            data[title_key] = titles
            self.currentEntryType = tag

    def endElement(self, tag):
        """Store a finished field, or pad the columns of a finished entry.

        Raises:
            ValueError: if a non-blank ``<year>`` is not an integer.
        """
        entry = self._entries.get(tag)
        if entry is not None:
            # Make this entry kind's columns the same length as its title
            # list; fields the entry did not provide become None.
            _, _, titles, columns = entry
            for column in columns.values():
                column.extend([None] * (len(titles) - len(column)))
            # The entry is closed: a stray field outside any entry must not
            # be appended to it, which would break column alignment.
            self.currentEntryType = None
        elif (
            tag == self.CurrentData
            and tag in self._FIELDS
            and self.currentEntryType in self._entries
        ):
            data, _, _, columns = self._entries[self.currentEntryType]
            text = "".join(self._text)
            setattr(self, tag, text)  # keep self.type/format/year updated
            if tag == "year":
                # A blank <year/> is stored as None instead of failing on
                # int("") or silently reusing the previous year.
                value = int(text) if text.strip() else None
            else:
                value = text
            columns[tag].append(value)
            data[tag] = columns[tag]
        self.CurrentData = ""
        self._text = []

    def characters(self, content):
        """Buffer character data of the currently open field element.

        The parser may deliver one text node in several calls, so pieces
        are accumulated here and joined in ``endElement``.
        """
        if self.CurrentData in self._FIELDS:
            self._text.append(content)

最新更新