在 Java 中不使用 DOM 解析高度嵌套的 XML



我的任务是修复一个相当烦人的堆内存不足问题。IBM 提供了一个可在 Java 中使用的 Cognos SDK,我们用它查询内容存储中的所有包,这些包以 XML 格式返回。然后我们解析这些 XML 并将其写入 SQL 数据库。性能分析显示,最严重的内存占用来自 char[],这个信息本身帮助不大(而且堆太大,很难分析),但确实指向了 DOM 解析器。

我们要处理的是 500-1500 个 XML 文件(严格来说是 XML 文本流),这些文件嵌套得非常深,大小不一,结构偶尔也不尽相同。大小从几 KB 到 30 MB 不等,程序在处理大约 300 个包之后会消耗 8 GB 以上的内存。前任程序员的做法是在每次解析 XML 之后手动调用 System.gc(),我希望去掉这种做法(它实际上也没有解决问题,只是让程序在最小的、只有 500 个包的服务器上勉强可用)。

我尝试过 JAXB,但这些 XML 的结构很奇怪,使得 JAXB 在这里很难使用(其中有某种"文件夹或查询主题"的结构)。上周我花了几个小时尝试 StAX,但没能让它正常工作,Woodstox 也是如此。我实在找不到针对这种情况的示例或教程。接下来我研究了 JDOM(因为我读到它的内存处理比纯 DOM 更好),但我不知道如何让它像 DOM 那样深入解析各层嵌套。当前的 DOM 解析代码:

// Parse the XML string into a DOM document via an in-memory byte stream.
is = new ByteArrayInputStream(xml.getBytes("UTF-8"));
xmlDoc = builder.parse(is);
is.close();
String _path, datatype, regularAggregate, description, formula;
String table, tableLoc;
// "*" selects every element of the document, regardless of nesting depth.
NodeList elements = xmlDoc.getElementsByTagName("*");
for (int j = 0; j < elements.getLength(); j++) {

    Element element = (Element) elements.item(j);
    String nodeName = element.getNodeName();
    // Compare by value: == on Strings tests reference identity and is not
    // guaranteed to match node names produced by the parser.
    if ("queryItem".equals(nodeName) || "measure".equals(nodeName)
            || "calculation".equals(nodeName) || "filter".equals(nodeName)) {
        if (element.hasAttribute("_path")) {
            // Assign to the declared local (_path); the stray ')' was a syntax error.
            _path = element.getAttribute("_path");
        }

依此类推,每个属性

我的 JDOM 尝试。目前,它只打印根元素,我还没有能够比第一个子层更深入:

// Build the JDOM document from the input and report its root element.
SAXBuilder saxBuilder = new SAXBuilder();
Document document = saxBuilder.build(inputFile);
Element root = document.getRootElement();
System.out.println("Root element :" + root.getName());
// getChildren(name) returns only DIRECT children, so this visits the
// <folder> elements directly under the root and the <queryItem> elements
// directly under those folders; deeper levels are not reached.
List<Element> rList = root.getChildren("folder");
if (rList != null) {
    for (Element node : rList) {
        List<Element> elements = node.getChildren("queryItem");
        if (elements != null) {
            for (Element a : elements) {
                // Print the attribute's value rather than the Attribute object.
                System.out.println(a.getAttributeValue("_path"));
            }
            // The no-op elements.size() call and rList.removeAll(elements) were
            // dropped: the latter modified rList while it was being iterated.
        }
    }
}

生成的随机包的 xsd 结构:

<?xml version="1.0" encoding="UTF-8"?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified">

  <!-- Document root: one <folder> followed by one <package>. -->
  <xs:element name="ResponseRoot">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="folder"/>
        <xs:element ref="package"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>

  <!-- Package descriptor: attributes only, no child elements. -->
  <xs:element name="package">
    <xs:complexType>
      <xs:attribute name="description" use="required"/>
      <xs:attribute name="name" use="required"/>
      <xs:attribute name="screenTip" use="required"/>
    </xs:complexType>
  </xs:element>

  <!-- Recursive container: any mix of <folder>/<querySubject>, then any number of <filter>. -->
  <xs:element name="folder">
    <xs:complexType>
      <xs:sequence>
        <xs:choice minOccurs="0" maxOccurs="unbounded">
          <xs:element ref="folder"/>
          <xs:element ref="querySubject"/>
        </xs:choice>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="filter"/>
      </xs:sequence>
      <xs:attribute name="_path" use="required"/>
      <xs:attribute name="_ref" use="required"/>
      <xs:attribute name="description" use="required"/>
      <xs:attribute name="isNamespace" use="required" type="xs:integer"/>
      <xs:attribute name="name" use="required"/>
      <xs:attribute name="screenTip" use="required"/>
    </xs:complexType>
  </xs:element>

  <!-- Query subject: any number of <queryItem>, then any number of <queryItemFolder>. -->
  <xs:element name="querySubject">
    <xs:complexType>
      <xs:sequence>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="queryItem"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="queryItemFolder"/>
      </xs:sequence>
      <xs:attribute name="_path" use="required"/>
      <xs:attribute name="_ref" use="required"/>
      <xs:attribute name="description" use="required"/>
      <xs:attribute name="name" use="required"/>
      <xs:attribute name="screenTip" use="required"/>
    </xs:complexType>
  </xs:element>

  <!-- Filter: leaf element, attributes only. -->
  <xs:element name="filter">
    <xs:complexType>
      <xs:attribute name="_path" use="required"/>
      <xs:attribute name="_ref" use="required"/>
      <xs:attribute name="description" use="required"/>
      <xs:attribute name="expression" use="required"/>
      <xs:attribute name="name" use="required"/>
      <xs:attribute name="screenTip" use="required"/>
    </xs:complexType>
  </xs:element>

  <!-- Query item: leaf element, attributes only. -->
  <xs:element name="queryItem">
    <xs:complexType>
      <xs:attribute name="_path" use="required"/>
      <xs:attribute name="_ref" use="required"/>
      <xs:attribute name="currency" use="required"/>
      <xs:attribute name="datatype" use="required" type="xs:NCName"/>
      <xs:attribute name="description" use="required"/>
      <xs:attribute name="displayType" use="required" type="xs:NCName"/>
      <xs:attribute name="expression" use="required"/>
      <xs:attribute name="name" use="required"/>
      <xs:attribute name="promptCascadeOnRef" use="required"/>
      <xs:attribute name="promptDisplayItemRef" use="required"/>
      <xs:attribute name="promptFilterItemRef" use="required"/>
      <xs:attribute name="promptType" use="required" type="xs:NCName"/>
      <xs:attribute name="regularAggregate" use="required" type="xs:NCName"/>
      <xs:attribute name="screenTip" use="required"/>
      <xs:attribute name="unSortable" use="required" type="xs:integer"/>
      <xs:attribute name="usage" use="required" type="xs:NCName"/>
    </xs:complexType>
  </xs:element>

  <!-- Recursive container: any mix of <queryItem> and nested <queryItemFolder>. -->
  <xs:element name="queryItemFolder">
    <xs:complexType>
      <xs:choice minOccurs="0" maxOccurs="unbounded">
        <xs:element ref="queryItem"/>
        <xs:element ref="queryItemFolder"/>
      </xs:choice>
      <xs:attribute name="_path" use="required"/>
      <xs:attribute name="_ref" use="required"/>
      <xs:attribute name="description" use="required"/>
      <xs:attribute name="name" use="required"/>
      <xs:attribute name="screenTip" use="required"/>
    </xs:complexType>
  </xs:element>

</xs:schema>

对于嵌套结构,如果为每个元素类型创建一个方法,则最容易管理。

public static void main(String[] args) throws Exception {
String xml = "<root>" +
"<folder name="A">" +
"<folder name="B">" +
"<book name="Learn Java">" +
"<chapter name="Hello, World!"/>" +
"<chapter name="Variables and Types"/>" +
"</book>" +
"</folder>" +
"</folder>" +
"</root>";
XMLInputFactory factory = XMLInputFactory.newFactory();
XMLStreamReader reader = factory.createXMLStreamReader(new StringReader(xml));
try {
reader.nextTag(); // Position on root element
String tagName = reader.getLocalName();
if (! tagName.equals("root"))
throw new XMLStreamException("Expected <root> element, found: " + tagName, reader.getLocation());
parseRoot(reader);
} finally {
reader.close();
}
}
/**
 * Consumes the children of the root element, which must all be {@code <folder>} elements.
 *
 * <p>Expects the reader to be on the root start tag; returns once the matching
 * end tag has been reached.
 *
 * @param reader the stream reader to advance
 * @throws XMLStreamException if a child other than {@code <folder>} is encountered,
 *         or the underlying stream is malformed
 */
private static void parseRoot(XMLStreamReader reader) throws XMLStreamException {
    for (int event = reader.nextTag(); event != XMLStreamConstants.END_ELEMENT; event = reader.nextTag()) {
        final String childName = reader.getLocalName();
        if (!childName.equals("folder")) {
            throw new XMLStreamException("Expected <folder> element, found: " + childName, reader.getLocation());
        }
        // Top-level folders have no ancestors, so they start from an empty path.
        parseFolder(reader, Collections.emptyList());
    }
}
/**
 * Consumes one {@code <folder>} element, recursing into nested folders and books.
 *
 * <p>Expects the reader to be on the folder's start tag; returns once the matching
 * end tag has been reached.
 *
 * @param reader      the stream reader to advance
 * @param parentPaths names of the enclosing folders, outermost first; not modified
 * @throws XMLStreamException if the {@code name} attribute is missing, a child other
 *         than {@code <folder>} or {@code <book>} is encountered, or the stream is malformed
 */
private static void parseFolder(XMLStreamReader reader, List<String> parentPaths) throws XMLStreamException {
    final String name = reader.getAttributeValue(null, "name");
    if (name == null) {
        throw new XMLStreamException("Missing 'name' attribute on <folder> element", reader.getLocation());
    }

    // Work on a copy so sibling folders never see each other's path entries.
    final List<String> currentPath = new ArrayList<>(parentPaths);
    currentPath.add(name);

    while (true) {
        if (reader.nextTag() == XMLStreamConstants.END_ELEMENT) {
            return;
        }
        final String childName = reader.getLocalName();
        switch (childName) {
            case "folder":
                parseFolder(reader, currentPath);
                break;
            case "book":
                parseBook(reader, currentPath);
                break;
            default:
                throw new XMLStreamException("Expected <folder> or <book> element, found: " + childName, reader.getLocation());
        }
    }
}
/**
 * Consumes one {@code <book>} element, whose children must all be {@code <chapter>} elements.
 *
 * <p>Expects the reader to be on the book's start tag; returns once the matching
 * end tag has been reached.
 *
 * @param reader     the stream reader to advance
 * @param folderPath names of the folders enclosing this book, outermost first
 * @throws XMLStreamException if the {@code name} attribute is missing, a child other
 *         than {@code <chapter>} is encountered, or the stream is malformed
 */
private static void parseBook(XMLStreamReader reader, List<String> folderPath) throws XMLStreamException {
    final String title = reader.getAttributeValue(null, "name");
    if (title == null) {
        throw new XMLStreamException("Missing 'name' attribute on <book> element", reader.getLocation());
    }

    for (int event = reader.nextTag(); event != XMLStreamConstants.END_ELEMENT; event = reader.nextTag()) {
        final String childName = reader.getLocalName();
        if (!childName.equals("chapter")) {
            throw new XMLStreamException("Expected <chapter> element, found: " + childName, reader.getLocation());
        }
        parseChapter(reader, folderPath, title);
    }
}
/**
 * Consumes one empty {@code <chapter>} element and prints where it was found.
 *
 * <p>Expects the reader to be on the chapter's start tag. The attribute is read
 * before {@code getElementText()} because that call advances the reader to the
 * element's end tag.
 *
 * @param reader     the stream reader to advance
 * @param folderPath names of the folders enclosing the book, outermost first
 * @param bookName   name of the book that contains this chapter
 * @throws XMLStreamException if the {@code name} attribute is missing, the element
 *         has text content, or the stream is malformed
 */
private static void parseChapter(XMLStreamReader reader, List<String> folderPath, String bookName) throws XMLStreamException {
    final String title = reader.getAttributeValue(null, "name");
    if (title == null) {
        throw new XMLStreamException("Missing 'name' attribute on <chapter> element", reader.getLocation());
    }

    final String content = reader.getElementText();
    if (!content.isEmpty()) {
        throw new XMLStreamException("<chapter> element must be empty", reader.getLocation());
    }

    // One report of four lines, emitted with the platform line separator.
    final String report = String.join(System.lineSeparator(),
            "Found:",
            "  Folder:  " + folderPath,
            "  Book:    " + bookName,
            "  Chapter: " + title);
    System.out.println(report);
}

输出

Found:
  Folder:  [A, B]
  Book:    Learn Java
  Chapter: Hello, World!
Found:
  Folder:  [A, B]
  Book:    Learn Java
  Chapter: Variables and Types

最新更新