我正在处理 75GB 的 XML 文件，因此不可能把它们整个加载到内存中并构建 DOM XML 树。于是我改为按块处理行（例如每块 10k 行，使用 readr::read_lines_chunked）。下面是一个 N=3 行的小演示，我在其中提取构建 tibble
所需的数据，但这不是很快：
library(tidyverse)

# Sample of N = 3 raw "<row .../>" lines as read from the huge XML file.
# BUG FIX: the inner double quotes must be escaped (\") to form valid
# R string literals — the original lines did not parse.
xml <- c(
  "<row Id=\"4\" Attrib1=\"1\" Attrib2=\"7\" Attrib3=\"2008-07-31T21:42:52.667\" Attrib4=\"645\" Attrib5=\"45103\" Attrib6=\"fjbnjahkcbvahjsvdghvadjhavsdjbaJKHFCBJHABCJKBASJHcvbjavbfcjkhabcjkhabsckajbnckjasbnckjbwjhfbvjahsdcvbzjhvcbwiebfewqkn\" Attrib7=\"8\" Attrib8=\"11652943\" Attrib9=\"Rich B\" Attrib10=\"2019-09-03T17:25:25.207\" Attrib11=\"2019-10-21T14:03:54.607\" Attrib12=\"1\" Attrib13=\"a|b|c|d|e|f|g\" Attrib14=\"13\" Attrib15=\"3\" Attrib16=\"49\" Attrib17=\"2012-10-31T16:42:47.213\"/>",
  "<row Id=\"5\" Attrib1=\"2\" Attrib2=\"8\" Attrib3=\"2008-07-31T21:42:52.999\" Attrib4=\"649\" Attrib5=\"7634\" Attrib6=\"fjbnjahkcbvahjsvdghvadjhavsdjbaJKHFCBJHABCJKBASJHcvbjavbfcjkhabcjkhabsckajbnckjasbnckjbwjhfbvjahsdcvbzjhvcbwiebfewqkn\" Attrib7=\"8\" Attrib8=\"11652943\" Attrib9=\"Rich B\" Attrib10=\"2019-09-03T17:25:25.207\" Attrib11=\"2019-10-21T14:03:54.607\" Attrib12=\"2\" Attrib13=\"a|b|c|d|e|f|g\" Attrib14=\"342\" Attrib15=\"43\" Attrib16=\"767\" Attrib17=\"2012-10-31T16:42:47.213\"/>",
  "<row Id=\"6\" Attrib1=\"3\" Attrib2=\"9\" Attrib3=\"2008-07-31T21:42:52.999\" Attrib4=\"348\" Attrib5=\"2732\" Attrib6=\"djhfbsdjhfbijhsdbfjkdbnfkjndaskjfnskjdlnfkjlsdnf\" Attrib7=\"9\" Attrib8=\"34873\" Attrib9=\"FHDHsf\" Attrib10=\"2019-09-03T17:25:25.207\" Attrib11=\"2019-10-21T14:03:54.607\" Attrib12=\"3\" Attrib13=\"a|b|c|d|e|f|g\" Attrib14=\"342\" Attrib15=\"43\" Attrib16=\"767\" Attrib17=\"2012-10-31T16:42:47.4333\"/>"
)

# One capture group per attribute we want to keep.
# BUG FIXES vs. the original:
#   * \d must be written \\d inside an R string literal;
#   * embedded quotes are escaped as \";
#   * the literal '.' before the fractional seconds is escaped (\\.) so it
#     no longer matches any character;
#   * the redundant extra [0-9]+ after the seconds field was removed
#     (it only forced needless backtracking).
# NOTE: Attrib13's value uses a nested group "([a-z]|[0-9]|\\||\\s)+",
# which adds an extra capture column; downstream code drops it together
# with the full-match column via [, -c(1, 12)].
pattern <- paste0(
  ".*(Id=\"\\d+\") ",
  "(Attrib1=\"\\d+\") ",
  "(Attrib2=\"\\d+\") ",
  "(Attrib3=\"[0-9]+-[0-9]+-[0-9]+T[0-9]+:[0-9]+:[0-9]+\\.[0-9]+\") ",
  "(Attrib4=\"\\d+\") ",
  "(Attrib5=\"\\d+\")",
  ".*(Attrib8=\"\\d+\") ",
  ".*(Attrib10=\"[0-9]+-[0-9]+-[0-9]+T[0-9]+:[0-9]+:[0-9]+\\.[0-9]+\") ",
  "(Attrib11=\"[0-9]+-[0-9]+-[0-9]+T[0-9]+:[0-9]+:[0-9]+\\.[0-9]+\")",
  ".*(Attrib13=\"([a-z]|[0-9]|\\||\\s)+\") ",
  "(Attrib14=\"\\d+\") ",
  "(Attrib15=\"\\d+\") ",
  "(Attrib16=\"\\d+\")"
)
# Match the capture groups in `pattern` and extract them; drop the
# full-match column (1) and the nested Attrib13 value group (12).
tmp <- str_match(xml, pattern)[, -c(1, 12)]

# Remove non-matching rows (their capture columns are all NA).
r <- which(is.na(tmp[, 1]))
if (length(r) > 0) {
  # drop = FALSE keeps the matrix shape even if only one row remains
  tmp <- tmp[-r, , drop = FALSE]
}

# Keep only the quoted value of each 'AttribN="..."' field, then strip
# the surrounding double quotes.
# BUG FIX: the quote characters inside the regex and in str_remove_all()
# must be escaped (\") — the original line did not parse.
tmp <- apply(tmp, 1, function(s) {
  str_remove_all(str_match(s, "(\".*\")")[, -1], "\"")
})

# apply() returns one column per input row, so transpose back.
tmp <- t(tmp)
tmp

# Convert to a tibble with proper column names.
colnames(tmp) <- c("Id", "Attrib1", "Attrib2", "Attrib3", "Attrib4",
                   "Attrib5", "Attrib8", "Attrib10", "Attrib11",
                   "Attrib13", "Attrib14", "Attrib15", "Attrib16")
as_tibble(tmp)
在性能方面有更好的方法吗？
更新：我在 10k 行（而不是 3 行）上对上面的代码进行了基准测试，耗时 900 秒。然后我把属性正则表达式捕获组的数量从 13 个减少到 7 个（只保留至关重要的组），相同的基准测试降到了 128 秒。
外推到 9731474 行，就从约 10 天降到约 35 小时。接着我用 Linux 命令 split -l1621913 -d Huge.xml Huge_split_ --verbose 将大文件拆分为 6 个文件，
以匹配我拥有的内核数量，现在在每个拆分文件上并行运行代码……这样就是 35/6≈5.8 小时……还不错。我的做法是：
library(doMC)

# One worker per split file (matches the number of cores / split files).
registerDoMC(6)

# Process the six split files in parallel; the last expression of the
# %dopar% block is collected into resultList (one element per file).
# FIXES vs. the original sketch: the dangling `partial <- # comment`
# was not valid R, and an explicit return() inside a %dopar% block is
# improper — the block's last value is what foreach collects.
resultList <- foreach(i = 0:5) %dopar% {
  file <- sprintf('Huge_split_0%d', i)
  # Placeholder: run the chunked extraction algorithm on `file`,
  # e.g. partial <- process_file(file)
  partial <- process_file(file)
  partial
}
使用 xml2，
我获得了明显更好的处理时间，尤其是在更大的规模下。由于我并不精通 xml2，
可能还有别的方法可以做得更好。
library(stringr)

# Extract the selected attributes from a character vector of raw
# "<row .../>" lines using one regex with capture groups.
#
# @param xmlvec Character vector, one raw XML <row .../> line per element.
# @param pat    Regex with capture groups. Defaults to the global `pattern`
#               for backward compatibility (the original version reached
#               out to the global directly — a scope violation the author
#               noted; passing it explicitly is now possible).
# @return data.frame with one character column per extracted attribute.
func_regex <- function(xmlvec, pat = pattern) {
  # Drop the full-match column (1) and the nested Attrib13 group (12).
  tmp <- str_match(xmlvec, pat)[, -c(1, 12)]
  # Remove non-matching rows (their capture columns are all NA).
  r <- which(is.na(tmp[, 1]))
  if (length(r) > 0) {
    # drop = FALSE keeps the matrix shape even if only one row remains
    tmp <- tmp[-r, , drop = FALSE]
  }
  # Keep only the quoted value of each field, then strip the quotes.
  # BUG FIX: quotes in the regex / str_remove_all() must be escaped (\")
  # — the original line did not parse.
  tmp <- apply(tmp, 1, function(s) {
    str_remove_all(str_match(s, "(\".*\")")[, -1], "\"")
  })
  # apply() returns one column per input row, so transpose back.
  tmp <- as.data.frame(t(tmp))
  colnames(tmp) <- c("Id", "Attrib1", "Attrib2", "Attrib3", "Attrib4",
                     "Attrib5", "Attrib8", "Attrib10", "Attrib11",
                     "Attrib13", "Attrib14", "Attrib15", "Attrib16")
  tmp
}
library(xml2)

# Parse the raw "<row .../>" lines with xml2: wrap them in a dummy root
# element so the vector forms one well-formed document, then collect each
# child's full attribute set (a named character vector) as one data.frame
# row. Returns a data.frame with one column per attribute encountered.
func_xml2 <- function(xmlvec) {
  wrapped <- paste("<xml>", paste(xmlvec, collapse = ""), "</xml>")
  doc <- read_xml(wrapped)
  attr_rows <- lapply(xml_children(doc), function(x) xml_attrs(x))
  as.data.frame(do.call(rbind, attr_rows))
}
（编辑：我意识到 func_regex
直接引用了全局变量 pattern，
这是一种草率的作用域违规。也许我会修复它并更新基准测试；我认为这不会改变 xml2
的相对速度优势。）
足够相似的输出:
# Sanity check: compare the structure of both functions' output on the
# 3-row sample. func_regex returns only the 13 selected attributes;
# func_xml2 returns all 18 attributes present in the XML.
str(func_regex(xml))
# 'data.frame': 3 obs. of 13 variables:
# $ Id : Factor w/ 3 levels "4","5","6": 1 2 3
# $ Attrib1 : Factor w/ 3 levels "1","2","3": 1 2 3
# $ Attrib2 : Factor w/ 3 levels "7","8","9": 1 2 3
# $ Attrib3 : Factor w/ 2 levels "2008-07-31T21:42:52.667",..: 1 2 2
# $ Attrib4 : Factor w/ 3 levels "348","645","649": 2 3 1
# $ Attrib5 : Factor w/ 3 levels "2732","45103",..: 2 3 1
# $ Attrib8 : Factor w/ 2 levels "11652943","34873": 1 1 2
# $ Attrib10: Factor w/ 1 level "2019-09-03T17:25:25.207": 1 1 1
# $ Attrib11: Factor w/ 1 level "2019-10-21T14:03:54.607": 1 1 1
# $ Attrib13: Factor w/ 1 level "a|b|c|d|e|f|g": 1 1 1
# $ Attrib14: Factor w/ 2 levels "13","342": 1 2 2
# $ Attrib15: Factor w/ 2 levels "3","43": 1 2 2
# $ Attrib16: Factor w/ 2 levels "49","767": 1 2 2
str(func_xml2(xml))
# 'data.frame': 3 obs. of 18 variables:
# $ Id : Factor w/ 3 levels "4","5","6": 1 2 3
# $ Attrib1 : Factor w/ 3 levels "1","2","3": 1 2 3
# $ Attrib2 : Factor w/ 3 levels "7","8","9": 1 2 3
# $ Attrib3 : Factor w/ 2 levels "2008-07-31T21:42:52.667",..: 1 2 2
# $ Attrib4 : Factor w/ 3 levels "348","645","649": 2 3 1
# $ Attrib5 : Factor w/ 3 levels "2732","45103",..: 2 3 1
# $ Attrib6 : Factor w/ 2 levels "djhfbsdjhfbijhsdbfjkdbnfkjndaskjfnskjdlnfkjlsdnf",..: 2 2 1
# $ Attrib7 : Factor w/ 2 levels "8","9": 1 1 2
# $ Attrib8 : Factor w/ 2 levels "11652943","34873": 1 1 2
# $ Attrib9 : Factor w/ 2 levels "FHDHsf","Rich B": 2 2 1
# $ Attrib10: Factor w/ 1 level "2019-09-03T17:25:25.207": 1 1 1
# $ Attrib11: Factor w/ 1 level "2019-10-21T14:03:54.607": 1 1 1
# $ Attrib12: Factor w/ 3 levels "1","2","3": 1 2 3
# $ Attrib13: Factor w/ 1 level "a|b|c|d|e|f|g": 1 1 1
# $ Attrib14: Factor w/ 2 levels "13","342": 1 2 2
# $ Attrib15: Factor w/ 2 levels "3","43": 1 2 2
# $ Attrib16: Factor w/ 2 levels "49","767": 1 2 2
# $ Attrib17: Factor w/ 2 levels "2012-10-31T16:42:47.213",..: 1 1 2
基准测试：
# Benchmark at three scales: 3 rows, 3,000 rows, and 300,000 rows.
# xml2 beats the regex approach consistently, and the gap widens
# with input size (roughly 5x -> 3.4x faster at the larger scales).
microbenchmark::microbenchmark(
func_regex(xml),
func_xml2(xml),
times = 10
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# func_regex(xml) 1.4306 1.4728 1.57756 1.48660 1.5875 2.2086 10
# func_xml2(xml) 1.0714 1.1075 1.18385 1.15275 1.1875 1.5418 10
# 1,000 copies of the 3-row sample = 3,000 rows.
xml1000 <- rep(xml, 1000)
microbenchmark::microbenchmark(
func_regex(xml1000),
func_xml2(xml1000),
times = 10
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# func_regex(xml1000) 458.4921 531.1159 570.1703 534.8204 538.6754 782.6863 10
# func_xml2(xml1000) 107.1230 107.7632 110.7316 109.1315 111.1904 121.8560 10
# 100,000 copies = 300,000 rows.
xml100000 <- rep(xml, 100000)
microbenchmark::microbenchmark(
func_regex(xml100000),
func_xml2(xml100000),
times = 10
)
# Unit: seconds
# expr min lq mean median uq max neval
# func_regex(xml100000) 52.89568 53.97438 55.64431 54.67441 56.95971 61.86949 10
# func_xml2(xml100000) 13.77857 16.02327 16.50498 16.58733 17.38458 17.81042 10