如何使用XML SAX解析器读写大型XML



我正在尝试使用 SAX 解析器,从下面的示例 XML 文档(原始文档约为 30 GB)中删除所有 project1 节点(及其子元素)。结果可以写入一个单独的修改后文件,也可以就地编辑,两种方式都可以接受。

可以使用。

sample.xml

<ROOT>
    <test src="http://dfs.com">Hi</test>
    <project1>This is old data<foo></foo></project1>
    <bar>
        <project1>ty</project1>
        <foo></foo>
    </bar>
</ROOT>

这是我的尝试:

parser.py

from xml.sax.handler import ContentHandler
import xml.sax
class MyHandler(xml.sax.handler.ContentHandler):
    def __init__(self, out_file):
        self._charBuffer = []
        self._result = []
        self._out = open(out_file, 'w')
    def _createElement(self, name, attrs):
        attributes = attrs.items()
        if attributes:
            out = ''
            for key, value in attributes:
                out += ' {}={}'.format(key, value)
            return '<{}{}>'.format(name, out)
        return '<{}>'.format(name)

    def _getCharacterData(self):
        data = ''.join(self._charBuffer).strip()
        self._charBuffer = []
        self._out.write(data.strip()) #remove strip() if whitespace is important
    def parse(self, f):
        xml.sax.parse(f, self)
    def characters(self, data):
        self._charBuffer.append(data)
    def startElement(self, name, attrs):
        if not name == 'project1': 
            self._result.append({})
            self._out.write(self._createElement(name, attrs))
    def endElement(self, name):
        if not name == 'project1': self._result[-1][name] = self._getCharacterData()
# Run the handler over sample.xml, sending its output to out.xml.
_handler = MyHandler('out.xml')
_handler.parse("sample.xml")

但我无法让它正常工作。

您可以实现一个 xml.sax.saxutils.XMLFilterBase 的子类,用它来过滤掉 project1 节点。

您可以使用 xml.sax.saxutils.XMLGenerator 来生成输出,而不必自己拼接 XML 字符串。

以下是 Python 3 代码;如果需要在 Python 2 下运行,请相应调整 super() 的调用方式。

from xml.sax import make_parser
from xml.sax.saxutils import XMLFilterBase, XMLGenerator

class Project1Filter(XMLFilterBase):
    """SAX filter that drops selected elements together with their content.

    Events are forwarded to the downstream ContentHandler only while the
    parser is outside every element whose name is in the
    ``tag_names_to_exclude`` parameter. Only the non-namespace element
    events (``startElement`` / ``endElement``) are inspected;
    ``startElementNS`` / ``endElementNS`` are not overridden here.
    """

    def __init__(self, tag_names_to_exclude, parent=None):
        """Create the filter.

        Args:
            tag_names_to_exclude: Container of element names to drop;
                it is only used for ``in`` membership tests.
            parent: Upstream XMLReader that produces the SAX events.
        """
        super().__init__(parent)
        self._tag_names_to_exclude = tag_names_to_exclude
        # Nesting depth of currently open excluded elements.
        self._excluded_depth = 0

    def _forward_events(self):
        """Return True while the parser is outside every excluded element."""
        return self._excluded_depth == 0

    def startElement(self, name, attrs):
        # Count first: the excluded element's own start tag must be dropped.
        if name in self._tag_names_to_exclude:
            self._excluded_depth += 1
        if self._forward_events():
            super().startElement(name, attrs)

    def endElement(self, name):
        # Test first: the excluded element's own end tag must be dropped.
        if self._forward_events():
            super().endElement(name)
        if name in self._tag_names_to_exclude:
            self._excluded_depth -= 1

    def characters(self, content):
        if self._forward_events():
            super().characters(content)

    def ignorableWhitespace(self, chars):
        if self._forward_events():
            super().ignorableWhitespace(chars)

    def processingInstruction(self, target, data):
        # Processing instructions inside an excluded element are content
        # of that element and must not leak into the output.
        if self._forward_events():
            super().processingInstruction(target, data)
    # override other content handler methods on XMLFilterBase as necessary

def main(input_path='input.xml', output_path='out-small.xml',
         tag_names_to_exclude=('project1', 'project2', 'project3')):
    """Copy *input_path* to *output_path* without the excluded elements.

    Args:
        input_path: XML document to read.
        output_path: File that receives the filtered document.
        tag_names_to_exclude: Names of the elements to drop, together
            with everything nested inside them.
    """
    reader = Project1Filter(frozenset(tag_names_to_exclude), make_parser())
    # Open in binary mode and let XMLGenerator do the encoding, so the
    # bytes on disk always match the encoding named in the XML declaration.
    with open(output_path, 'wb') as f:
        handler = XMLGenerator(f, encoding='utf-8')
        reader.setContentHandler(handler)
        reader.parse(input_path)


if __name__ == "__main__":
    main()

最新更新