我正在处理 75GB 的 XML 文件，因此不可能把它们整个加载到内存中并构建 DOM XML 树。于是我改为按块处理行（例如每块 10k 行，使用 readr::read_lines_chunked）。下面是一个 N=3 行的小演示，我在其中提取构建 tibble
所需的数据，但这不是很快：
library(tidyverse)

# Sample of N = 3 raw "<row .../>" lines as read from the huge XML file.
# BUG FIX: the inner double quotes must be escaped (\") to form valid
# R string literals — the original lines did not parse.
xml <- c(
  "<row Id=\"4\" Attrib1=\"1\" Attrib2=\"7\" Attrib3=\"2008-07-31T21:42:52.667\" Attrib4=\"645\" Attrib5=\"45103\" Attrib6=\"fjbnjahkcbvahjsvdghvadjhavsdjbaJKHFCBJHABCJKBASJHcvbjavbfcjkhabcjkhabsckajbnckjasbnckjbwjhfbvjahsdcvbzjhvcbwiebfewqkn\" Attrib7=\"8\" Attrib8=\"11652943\" Attrib9=\"Rich B\" Attrib10=\"2019-09-03T17:25:25.207\" Attrib11=\"2019-10-21T14:03:54.607\" Attrib12=\"1\" Attrib13=\"a|b|c|d|e|f|g\" Attrib14=\"13\" Attrib15=\"3\" Attrib16=\"49\" Attrib17=\"2012-10-31T16:42:47.213\"/>",
  "<row Id=\"5\" Attrib1=\"2\" Attrib2=\"8\" Attrib3=\"2008-07-31T21:42:52.999\" Attrib4=\"649\" Attrib5=\"7634\" Attrib6=\"fjbnjahkcbvahjsvdghvadjhavsdjbaJKHFCBJHABCJKBASJHcvbjavbfcjkhabcjkhabsckajbnckjasbnckjbwjhfbvjahsdcvbzjhvcbwiebfewqkn\" Attrib7=\"8\" Attrib8=\"11652943\" Attrib9=\"Rich B\" Attrib10=\"2019-09-03T17:25:25.207\" Attrib11=\"2019-10-21T14:03:54.607\" Attrib12=\"2\" Attrib13=\"a|b|c|d|e|f|g\" Attrib14=\"342\" Attrib15=\"43\" Attrib16=\"767\" Attrib17=\"2012-10-31T16:42:47.213\"/>",
  "<row Id=\"6\" Attrib1=\"3\" Attrib2=\"9\" Attrib3=\"2008-07-31T21:42:52.999\" Attrib4=\"348\" Attrib5=\"2732\" Attrib6=\"djhfbsdjhfbijhsdbfjkdbnfkjndaskjfnskjdlnfkjlsdnf\" Attrib7=\"9\" Attrib8=\"34873\" Attrib9=\"FHDHsf\" Attrib10=\"2019-09-03T17:25:25.207\" Attrib11=\"2019-10-21T14:03:54.607\" Attrib12=\"3\" Attrib13=\"a|b|c|d|e|f|g\" Attrib14=\"342\" Attrib15=\"43\" Attrib16=\"767\" Attrib17=\"2012-10-31T16:42:47.4333\"/>"
)

# One capture group per attribute we want to keep.
# BUG FIXES vs. the original:
#   * \d must be written \\d inside an R string literal;
#   * embedded quotes are escaped as \";
#   * the literal '.' before the fractional seconds is escaped (\\.) so it
#     no longer matches any character;
#   * the redundant extra [0-9]+ after the seconds field was removed
#     (it only forced needless backtracking).
# NOTE: Attrib13's value uses a nested group "([a-z]|[0-9]|\\||\\s)+",
# which adds an extra capture column; downstream code drops it together
# with the full-match column via [, -c(1, 12)].
pattern <- paste0(
  ".*(Id=\"\\d+\") ",
  "(Attrib1=\"\\d+\") ",
  "(Attrib2=\"\\d+\") ",
  "(Attrib3=\"[0-9]+-[0-9]+-[0-9]+T[0-9]+:[0-9]+:[0-9]+\\.[0-9]+\") ",
  "(Attrib4=\"\\d+\") ",
  "(Attrib5=\"\\d+\")",
  ".*(Attrib8=\"\\d+\") ",
  ".*(Attrib10=\"[0-9]+-[0-9]+-[0-9]+T[0-9]+:[0-9]+:[0-9]+\\.[0-9]+\") ",
  "(Attrib11=\"[0-9]+-[0-9]+-[0-9]+T[0-9]+:[0-9]+:[0-9]+\\.[0-9]+\")",
  ".*(Attrib13=\"([a-z]|[0-9]|\\||\\s)+\") ",
  "(Attrib14=\"\\d+\") ",
  "(Attrib15=\"\\d+\") ",
  "(Attrib16=\"\\d+\")"
)
# Match the capture groups in `pattern` and extract them; drop the
# full-match column (1) and the nested Attrib13 value group (12).
tmp <- str_match(xml, pattern)[, -c(1, 12)]

# Remove non-matching rows (their capture columns are all NA).
r <- which(is.na(tmp[, 1]))
if (length(r) > 0) {
  # drop = FALSE keeps the matrix shape even if only one row remains
  tmp <- tmp[-r, , drop = FALSE]
}

# Keep only the quoted value of each 'AttribN="..."' field, then strip
# the surrounding double quotes.
# BUG FIX: the quote characters inside the regex and in str_remove_all()
# must be escaped (\") — the original line did not parse.
tmp <- apply(tmp, 1, function(s) {
  str_remove_all(str_match(s, "(\".*\")")[, -1], "\"")
})

# apply() returns one column per input row, so transpose back.
tmp <- t(tmp)
tmp

# Convert to a tibble with proper column names.
colnames(tmp) <- c("Id", "Attrib1", "Attrib2", "Attrib3", "Attrib4",
                   "Attrib5", "Attrib8", "Attrib10", "Attrib11",
                   "Attrib13", "Attrib14", "Attrib15", "Attrib16")
as_tibble(tmp)
在性能方面有更好的方法吗？
更新：我在 10k 行（而不是 3 行）上对上面的代码进行了基准测试，耗时 900 秒。然后我把属性正则表达式捕获组的数量从 13 个减少到 7 个（只保留至关重要的组），相同的基准测试降到了 128 秒。
外推到 9731474 行，就从约 10 天降到约 35 小时。接着我用 Linux 命令 split -l1621913 -d Huge.xml Huge_split_ --verbose 将大文件拆分为 6 个文件，
以匹配我拥有的内核数量，现在在每个拆分文件上并行运行代码……这样就是 35/6≈5.8 小时……还不错。我的做法是：
library(doMC)

# One worker per split file (matches the number of cores / split files).
registerDoMC(6)

# Process the six split files in parallel; the last expression of the
# %dopar% block is collected into resultList (one element per file).
# FIXES vs. the original sketch: the dangling `partial <- # comment`
# was not valid R, and an explicit return() inside a %dopar% block is
# improper — the block's last value is what foreach collects.
resultList <- foreach(i = 0:5) %dopar% {
  file <- sprintf('Huge_split_0%d', i)
  # Placeholder: run the chunked extraction algorithm on `file`,
  # e.g. partial <- process_file(file)
  partial <- process_file(file)
  partial
}
使用 xml2，
我获得了明显更好的处理时间，尤其是在更大的规模下。由于我并不精通 xml2，
可能还有别的方法可以做得更好。
library(stringr)

# Extract the selected attributes from a character vector of raw
# "<row .../>" lines using one regex with capture groups.
#
# @param xmlvec Character vector, one raw XML <row .../> line per element.
# @param pat    Regex with capture groups. Defaults to the global `pattern`
#               for backward compatibility (the original version reached
#               out to the global directly — a scope violation the author
#               noted; passing it explicitly is now possible).
# @return data.frame with one character column per extracted attribute.
func_regex <- function(xmlvec, pat = pattern) {
  # Drop the full-match column (1) and the nested Attrib13 group (12).
  tmp <- str_match(xmlvec, pat)[, -c(1, 12)]
  # Remove non-matching rows (their capture columns are all NA).
  r <- which(is.na(tmp[, 1]))
  if (length(r) > 0) {
    # drop = FALSE keeps the matrix shape even if only one row remains
    tmp <- tmp[-r, , drop = FALSE]
  }
  # Keep only the quoted value of each field, then strip the quotes.
  # BUG FIX: quotes in the regex / str_remove_all() must be escaped (\")
  # — the original line did not parse.
  tmp <- apply(tmp, 1, function(s) {
    str_remove_all(str_match(s, "(\".*\")")[, -1], "\"")
  })
  # apply() returns one column per input row, so transpose back.
  tmp <- as.data.frame(t(tmp))
  colnames(tmp) <- c("Id", "Attrib1", "Attrib2", "Attrib3", "Attrib4",
                     "Attrib5", "Attrib8", "Attrib10", "Attrib11",
                     "Attrib13", "Attrib14", "Attrib15", "Attrib16")
  tmp
}
library(xml2)

# Parse the raw "<row .../>" lines with xml2: wrap them in a dummy root
# element so the vector forms one well-formed document, then collect each
# child's full attribute set (a named character vector) as one data.frame
# row. Returns a data.frame with one column per attribute encountered.
func_xml2 <- function(xmlvec) {
  wrapped <- paste("<xml>", paste(xmlvec, collapse = ""), "</xml>")
  doc <- read_xml(wrapped)
  attr_rows <- lapply(xml_children(doc), function(x) xml_attrs(x))
  as.data.frame(do.call(rbind, attr_rows))
}
（编辑：我意识到 func_regex
直接引用了全局变量 pattern，
这是一种草率的作用域违规。也许我会修复它并更新基准测试；我认为这不会改变 xml2
的相对速度优势。）
足够相似的输出:
# Sanity check: compare the structure of both functions' output on the
# 3-row sample. func_regex returns only the 13 selected attributes;
# func_xml2 returns all 18 attributes present in the XML.
str(func_regex(xml))
# 'data.frame': 3 obs. of 13 variables:
# $ Id : Factor w/ 3 levels "4","5","6": 1 2 3
# $ Attrib1 : Factor w/ 3 levels "1","2","3": 1 2 3
# $ Attrib2 : Factor w/ 3 levels "7","8","9": 1 2 3
# $ Attrib3 : Factor w/ 2 levels "2008-07-31T21:42:52.667",..: 1 2 2
# $ Attrib4 : Factor w/ 3 levels "348","645","649": 2 3 1
# $ Attrib5 : Factor w/ 3 levels "2732","45103",..: 2 3 1
# $ Attrib8 : Factor w/ 2 levels "11652943","34873": 1 1 2
# $ Attrib10: Factor w/ 1 level "2019-09-03T17:25:25.207": 1 1 1
# $ Attrib11: Factor w/ 1 level "2019-10-21T14:03:54.607": 1 1 1
# $ Attrib13: Factor w/ 1 level "a|b|c|d|e|f|g": 1 1 1
# $ Attrib14: Factor w/ 2 levels "13","342": 1 2 2
# $ Attrib15: Factor w/ 2 levels "3","43": 1 2 2
# $ Attrib16: Factor w/ 2 levels "49","767": 1 2 2
str(func_xml2(xml))
# 'data.frame': 3 obs. of 18 variables:
# $ Id : Factor w/ 3 levels "4","5","6": 1 2 3
# $ Attrib1 : Factor w/ 3 levels "1","2","3": 1 2 3
# $ Attrib2 : Factor w/ 3 levels "7","8","9": 1 2 3
# $ Attrib3 : Factor w/ 2 levels "2008-07-31T21:42:52.667",..: 1 2 2
# $ Attrib4 : Factor w/ 3 levels "348","645","649": 2 3 1
# $ Attrib5 : Factor w/ 3 levels "2732","45103",..: 2 3 1
# $ Attrib6 : Factor w/ 2 levels "djhfbsdjhfbijhsdbfjkdbnfkjndaskjfnskjdlnfkjlsdnf",..: 2 2 1
# $ Attrib7 : Factor w/ 2 levels "8","9": 1 1 2
# $ Attrib8 : Factor w/ 2 levels "11652943","34873": 1 1 2
# $ Attrib9 : Factor w/ 2 levels "FHDHsf","Rich B": 2 2 1
# $ Attrib10: Factor w/ 1 level "2019-09-03T17:25:25.207": 1 1 1
# $ Attrib11: Factor w/ 1 level "2019-10-21T14:03:54.607": 1 1 1
# $ Attrib12: Factor w/ 3 levels "1","2","3": 1 2 3
# $ Attrib13: Factor w/ 1 level "a|b|c|d|e|f|g": 1 1 1
# $ Attrib14: Factor w/ 2 levels "13","342": 1 2 2
# $ Attrib15: Factor w/ 2 levels "3","43": 1 2 2
# $ Attrib16: Factor w/ 2 levels "49","767": 1 2 2
# $ Attrib17: Factor w/ 2 levels "2012-10-31T16:42:47.213",..: 1 1 2
基准测试：
# Benchmark at three scales: 3 rows, 3,000 rows, and 300,000 rows.
# xml2 beats the regex approach consistently, and the gap widens
# with input size (roughly 5x -> 3.4x faster at the larger scales).
microbenchmark::microbenchmark(
func_regex(xml),
func_xml2(xml),
times = 10
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# func_regex(xml) 1.4306 1.4728 1.57756 1.48660 1.5875 2.2086 10
# func_xml2(xml) 1.0714 1.1075 1.18385 1.15275 1.1875 1.5418 10
# 1,000 copies of the 3-row sample = 3,000 rows.
xml1000 <- rep(xml, 1000)
microbenchmark::microbenchmark(
func_regex(xml1000),
func_xml2(xml1000),
times = 10
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# func_regex(xml1000) 458.4921 531.1159 570.1703 534.8204 538.6754 782.6863 10
# func_xml2(xml1000) 107.1230 107.7632 110.7316 109.1315 111.1904 121.8560 10
# 100,000 copies = 300,000 rows.
xml100000 <- rep(xml, 100000)
microbenchmark::microbenchmark(
func_regex(xml100000),
func_xml2(xml100000),
times = 10
)
# Unit: seconds
# expr min lq mean median uq max neval
# func_regex(xml100000) 52.89568 53.97438 55.64431 54.67441 56.95971 61.86949 10
# func_xml2(xml100000) 13.77857 16.02327 16.50498 16.58733 17.38458 17.81042 10