Python:在读取文件时创建各种文件对象

我正在读取一个包含多个<xml>..</xml>元素的大文件。由于每个XML解析器在处理这样的文件时都会出问题，所以我希望为每个<xml>..</xml>块高效地生成一个新的文件对象。

我开始在Python中对文件对象进行子类化，但在那里被卡住了。我想，我必须截取以</xml>开头的每一行，并返回一个新的文件对象；可能通过使用yield。

有人能引导我朝着正确的方向迈出这一步吗？

这是我当前的代码片段：

#!/usr/bin/env python
from lxml import etree
from StringIO import StringIO
class handler(file):
  """File subclass that splits its content into one buffer per ``</xml>`` block."""

  def __init__(self, name, mode):
    """Open ``name`` with ``mode`` by delegating to ``file.__init__``."""
    file.__init__(self, name, mode)

  def next(self):
    """Return the next line of the file (pass-through to ``file.next``)."""
    return file.next(self)

  def listXmls(self):
    """Yield a ``StringIO`` for every run of lines ending with ``</xml>``.

    Each line is written with its surrounding whitespace stripped, so line
    breaks are not preserved in the yielded buffers.  A buffer is rewound
    before it is yielded so that a consumer which reads from it (such as an
    XML parser) starts at the beginning instead of at end-of-buffer.  Lines
    that follow the last ``</xml>`` line are discarded.
    """
    output = StringIO()
    for line in self:
      stripped = line.strip()
      output.write(stripped)
      if stripped == '</xml>':
        # Without the rewind the reader would see an empty stream.
        output.seek(0)
        yield output
        output = StringIO()
    # Whatever is left never reached a closing tag; drop it.
    output.close()
# Split 'myxml.xml' into per-block buffers and print the tag of every <id>
# element.  The ``with`` block closes the file even if parsing raises.
with handler('myxml.xml', 'r') as f:
  for elem in f.listXmls():
    # Debug output: the 'm' markers make the buffer boundaries visible.
    print('m' + elem.getvalue() + 'm')
    context = etree.iterparse(elem, events=('end',), tag='id')
    for event, element in context:
      print(element.tag)

谢谢！

解决方案（仍然对更好的版本感兴趣）：

#!/usr/bin/env python
from lxml import etree
from StringIO import StringIO
class handler(file):
  """File subclass that splits its content into one buffer per XML document."""

  def __init__(self, name, mode):
    """Open ``name`` with ``mode`` by delegating to ``file.__init__``."""
    file.__init__(self, name, mode)

  def next(self):
    """Return the next line of the file (pass-through to ``file.next``)."""
    return file.next(self)

  def listXmls(self):
    """Yield one rewound ``StringIO`` per document found in the file.

    A new document starts at every line that begins with ``<?xml``; the
    very first line always opens the first document, whatever it contains.
    Lines are copied unchanged.  An empty file yields nothing, and a file
    with a single line yields exactly one buffer.
    """
    document = None
    for line in self:
      if document is None:
        # The first line of the file always opens the first document.
        document = StringIO()
      elif line.startswith('<?xml'):
        # A new XML declaration closes the document collected so far.
        document.seek(0)
        yield document
        document = StringIO()
      document.write(line)
    if document is not None:
      # Emit the last document, which has no following declaration.
      document.seek(0)
      yield document
# Parse every document contained in 'myxml.xml' and print the tag of each
# <id> element.  The ``with`` block closes the file even if parsing raises.
with handler('myxml.xml', 'r') as f:
  for elem in f.listXmls():
    context = etree.iterparse(elem, events=('end',), tag='id')
    for event, element in context:
      print(element.tag)

虽然这不能直接回答您的问题，但仍然可以解决您的问题：只需在开头添加一个额外的<xml>，在结尾添加一个额外的</xml>，就可能使您的XML解析器接受该文档：

from lxml import etree
# Wrap the sibling <xml> elements in one extra root element before parsing.
document = "<xml>a</xml> <xml>b</xml>"
document = "<xml>" + document + "</xml>"
# Iterating the parsed root visits each of the original <xml> elements.
for subdocument in etree.XML(document):
    # whatever
    pass  # placeholder so the loop body is valid Python

相关内容

最新更新

热门标签：