运行基准测试时XML解析缓慢



我想测量 Go 解析 XML 文件所需的时间,所以决定编写一个基准测试(benchmark)。

我有一个函数可以生成一个包含XML文档的io.Reader

// PRIVATE: createSampleXMLReader creates an io.Reader instance over an XML document made of a '<ROOT>' element that
//          wraps nodeElementCount '<Node />' elements, suitable for running a benchmark test.
func createSampleXMLReader(
	nodeElementCount int) io.Reader {
	xmlContents := new(strings.Builder)
	// The opening tag is followed by a real line break ("\n"), matching the layout of the node elements.
	xmlContents.WriteString("<ROOT>\n")
	for i := 0; i < nodeElementCount; i++ {
		appendNodeXMLElement(xmlContents)
	}
	xmlContents.WriteString("</ROOT>")
	// The returned reader starts at the beginning of the document; callers only see the io.Reader interface.
	return strings.NewReader(xmlContents.String())
}
// PRIVATE: appendNodeXMLElement appends one complete '<Node />' element (including its child elements and a
//          trailing line break) to the given strings.Builder.
func appendNodeXMLElement(
	xmlDocument *strings.Builder) {
	// A raw string literal keeps the attribute quotes and line breaks exactly as they should appear in the
	// document, without any backslash escapes that could get lost.
	const nodeElement = `<Node id="0" position="0" depth="0" parent="0">
    <Name>Name</Name>
    <Description>Description</Description>
    <OwnInformation>
        <Title>Title</Title>
        <Description>Description</Description>
    </OwnInformation>
    <Assets>
        <Asset id="0" position="0" type="0" category="0">
            <OriginalFile>OriginalFile</OriginalFile>
            <Description>Description</Description>
            <Uri>Uri</Uri>
        </Asset>
        <Asset id="1" position="1" type="1" category="1">
            <OriginalFile>OriginalFile</OriginalFile>
            <Description>Description</Description>
            <Uri>Uri</Uri>
        </Asset>
        <Asset id="2" position="2" type="2" category="2">
            <OriginalFile>OriginalFile</OriginalFile>
            <Description>Description</Description>
            <Uri>Uri</Uri>
        </Asset>
        <Asset id="3" position="3" type="3" category="3">
            <OriginalFile>OriginalFile</OriginalFile>
            <Description>Description</Description>
            <Uri>Uri</Uri>
        </Asset>
        <Asset id="4" position="4" type="4" category="4">
            <OriginalFile>OriginalFile</OriginalFile>
            <Description>Description</Description>
            <Uri>Uri</Uri>
        </Asset>
    </Assets>
    <Synonyms>
        <Synonym>Synonym 0</Synonym>
        <Synonym>Synonym 1</Synonym>
        <Synonym>Synonym 2</Synonym>
        <Synonym>Synonym 3</Synonym>
        <Synonym>Synonym 4</Synonym>
    </Synonyms>
</Node>
`
	xmlDocument.WriteString(nodeElement)
}

然后,我就有了一个实际解析这个XML文档的函数。

// PRIVATE: parseXML walks the XML tokens read from xmlReader and decodes every '<Node />' start element it meets.
//          Decoded nodes are not returned and errors reported by the decoder are not inspected; the walk stops
//          at the first nil token.
func parseXML(
	xmlReader io.Reader) {
	decoder := xml.NewDecoder(xmlReader)
	for {
		// The error result is discarded; a nil token is the only stop condition.
		tok, _ := decoder.Token()
		if tok == nil {
			return
		}
		// Only start elements named "Node" are decoded; every other token is skipped.
		if start, isStart := tok.(xml.StartElement); isStart && start.Name.Local == "Node" {
			decodeNodeElement(decoder, &start)
		}
	}
}
// PRIVATE: decodeNodeElement decodes the '<Node />' element that starts at the given start element into a
//          'model.Node' value. Both the decoded value and any decoding error are discarded.
func decodeNodeElement(
	xmlDecoder *xml.Decoder,
	element *xml.StartElement) {
	var decoded model.Node
	_ = xmlDecoder.DecodeElement(&decoded, element)
}

接着,我有一个执行基准测试的函数:

// PRIVATE: runBenchmarkParseXML runs a benchmark that calls parseXML benchmark.N times on a sample XML document
//          holding nodeCount '<Node />' elements.
func runBenchmarkParseXML(
	nodeCount int,
	benchmark *testing.B) {
	// Arrange: the sample reader is built once, before the loop.
	sampleReader := createSampleXMLReader(nodeCount)

	// Act: every iteration hands that same reader instance to parseXML.
	for iteration := 0; iteration < benchmark.N; iteration++ {
		parseXML(sampleReader)
	}
}

最后,我有 5 个基准测试函数,分别对应包含 1、10、100、1000 和 10000 个元素的 XML 文档。

// BenchmarkParseXML1 benchmarks parsing a document with 1 '<Node />' element.
func BenchmarkParseXML1(benchmark *testing.B) {
	runBenchmarkParseXML(1, benchmark)
}

// BenchmarkParseXML10 benchmarks parsing a document with 10 '<Node />' elements.
func BenchmarkParseXML10(benchmark *testing.B) {
	runBenchmarkParseXML(10, benchmark)
}

// BenchmarkParseXML100 benchmarks parsing a document with 100 '<Node />' elements.
func BenchmarkParseXML100(benchmark *testing.B) {
	runBenchmarkParseXML(100, benchmark)
}

// BenchmarkParseXML1000 benchmarks parsing a document with 1000 '<Node />' elements.
func BenchmarkParseXML1000(benchmark *testing.B) {
	runBenchmarkParseXML(1000, benchmark)
}

// BenchmarkParseXML10000 benchmarks parsing a document with 10000 '<Node />' elements.
func BenchmarkParseXML10000(benchmark *testing.B) {
	runBenchmarkParseXML(10000, benchmark)
}

当我运行这个基准测试时,我看到以下输出:

BenchmarkParseXML1-4             5000000               226 ns/op
BenchmarkParseXML10-4           10000000               230 ns/op
BenchmarkParseXML100-4           5000000               226 ns/op
BenchmarkParseXML1000-4          5000000               254 ns/op
BenchmarkParseXML10000-4               1        1690998100 ns/op

为什么在解析包含10.000个元素的XML文件的基准测试中会有这样的差异,而其余的基准测试时间是稳定的?

是我的基准测试写得有问题,还是 parseXML 函数的实现不正确?

编辑:Node 结构体

// Node represents a '<Node />' element in the XML document.
// The struct tags bind XML attributes (",attr") and child elements to the fields below.
type Node struct {
// Attributes of the <Node> element itself.
ID             int    `xml:"id,attr"`
Position       int    `xml:"position,attr"`
Depth          int    `xml:"depth,attr"`
// NOTE(review): Parent is a string while the other attributes here are ints — confirm this is intended.
Parent         string `xml:"parent,attr"`
// Text content of the <Name> and <Description> child elements.
Name           string `xml:"Name"`
Description    string `xml:"Description"`
// OwnInformation maps the <OwnInformation> child element and its <Title> / <Description> children.
OwnInformation struct {
Title       string `xml:"Title"`
Description string `xml:"Description"`
} `xml:"OwnInformation"`
// Assets collects one entry per <Asset> element found inside the <Assets> child element.
Assets []struct {
// NOTE(review): ID and Type are strings here, whereas Node.ID is an int — confirm this is intended.
ID           string `xml:"id,attr"`
Position     int    `xml:"position,attr"`
Type         string `xml:"type,attr"`
Category     int    `xml:"category,attr"`
OriginalFile string `xml:"OriginalFile"`
Description  string `xml:"Description"`
URI          string `xml:"Uri"`
} `xml:"Assets>Asset"`
// Synonyms collects the text of each <Synonym> element found inside the <Synonyms> child element.
Synonyms []string `xml:"Synonyms>Synonym"`
}

提前感谢您的指导。

您的基准测试存在缺陷。事实上,你将输入增加了10倍,但持续时间始终保持不变,这应该会让你对这个基准非常怀疑。

你每次迭代都在重复使用同一个读取器(io.Reader)。每个基准测试中只有第一次迭代真正解析了文档;之后的调用读取的是一个已经到达 EOF 的读取器,因此几乎立即返回。

更改设置,使其返回一个字节切片,并为每次迭代构造一个新的读取器。这将产生预期的结果:

func createSampleXMLDoc(nodeElementCount int) []byte {
xmlContents := &bytes.Buffer{}
xmlContents.WriteString("<ROOT>n")
for i := 0; i < nodeElementCount; i++ {
appendNodeXMLElement(xmlContents)
}
xmlContents.WriteString("</ROOT>")
return xmlContents.Bytes()
}
// runBenchmarkParseXML benchmarks parseXML on a sample document that holds nodeCount '<Node />'
// elements. The document is built once; each iteration parses it through a new reader.
func runBenchmarkParseXML(nodeCount int, b *testing.B) {
	doc := createSampleXMLDoc(nodeCount)
	// Building the document is setup, not the work being measured, so exclude it from the timing.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// A fresh reader per iteration, so every pass starts at the beginning of the document.
		xmlReader := bytes.NewReader(doc)
		parseXML(xmlReader)
	}
}

我的机器上的结果(正如预期的那样,输入增加10倍,时间增加10倍):

$ go test -benchtime=5s -bench .
goos: linux
goarch: amd64
BenchmarkParseXML1-8              100000            115978 ns/op
BenchmarkParseXML10-8              10000           1147605 ns/op
BenchmarkParseXML100-8              1000          11586980 ns/op
BenchmarkParseXML1000-8               50         124199120 ns/op
BenchmarkParseXML10000-8               5        1003668966 ns/op

-benchtime=5s 将每个基准测试的默认运行时间从 1 秒增加到 5 秒。在最后一种情况下,1 秒不足以完成一次以上的迭代,因而结果不可靠。这也是你在原始基准测试的最后一项中看到那个很大的数值的原因:那里只运行了一次迭代,也就是真正执行解析的那一次;而在其他几项中,第一次迭代很慢,但之后的每一次都会立即返回,因此平均时间急剧下降。顺便说一句,基准测试只运行了一次迭代本身就是另一个危险信号。实际上,即使是五次迭代,对于可靠的测量来说仍然偏少。

这个故事的寓意:务必检查函数返回的错误(error)。

相关内容

  • 没有找到相关文章

最新更新