将xml表转换为用于read_xmlR的xml

  • 本文关键字:xml xmlR read 转换 用于 sql r xml
  • 更新时间 :
  • 英文 :


我有一个满是XML值的表(都是字符格式(,但无法使用read_XML。我尝试过使用as.list和str来帮助,但得到了以下错误

UseMethod中的错误("read_xml"(:没有适用于"read_xml"的方法应用于类"的对象;列表">

该表有一列,列中充满了以下格式的XML;XML_M":

<Root>
<Product>
<Test_ID value="1" />
<Effective_Date value="2022-01-01" />
<Membership value="Yes" />
<Request>
<Request_ID value="1" />
<Request_type value="Simple" />
</Request>
<Request>
<Request_ID value="2" />
<Request_type value="Complex" />
</Request>
</Product>
</Root>

目标是获取所有XML值,并将它们转换为另一个表,但该表将元素作为列。我真正想问的是,我需要做些什么来允许read_xml读取这列xml。

编辑:在这里附上了一个示例表:表

虽然@r2evans在上一个问题中用R方法有效地回答了您的问题,其中您只需向下迭代数据帧的一列,而不是单个XML文件,但考虑一种SQL方法,通过用下面的XML查询分解数据,您可以从R调用该查询。下面假设XML列存储为XML类型。

R (调整my_table和my_xml_column(

...
xml_query <- (
"SELECT
Test_ID = prod.value('(Test_ID/@value)[1]', 'integer'), 
Effective_Date = prod.value('(Effective_Date/@value)[1]', 'datetime'), 
Membership = prod.value('(Membership/@value)[1]', 'varchar(50)'),
Request_ID = req.value('(Request_ID/@value)[1]', 'integer'),
Request_type = req.value('(Request_type/@value)[1]', 'varchar(50)')
FROM myTable
CROSS APPLY
my_xml_column.nodes('/Root/Product') AS x1(prod)
CROSS APPLY
my_xml_column.nodes('/Root/Product/Request') AS x2(req)"
)
xml_df <- DBI::dbGetQuery(conn, xml_query)

SQL Fiddle演示

也许可以使用上一个答案中的代码来结束这个问题。

样本数据:

myframe <- structure(list(id = 1:2, xml = c("<Root>n<Product>n    <Test_ID value="1" />n    <Effective_Date value="2022-01-01" />n    <Membership value="Yes" />n    <Request>n      <Request_ID value="1" />n      <Request_type value="Simple" />n    </Request>n    <Request>n      <Request_ID value="2" />n      <Request_type value="Complex" />n    </Request>n</Product>n</Root>", "<Root>n<Product>n    <Test_ID value="1" />n    <Effective_Date value="2022-01-01" />n    <Membership value="Yes" />n    <Request>n      <Request_ID value="3" />n      <Request_type value="Simple" />n    </Request>n    <Request>n      <Request_ID value="4" />n      <Request_type value="Complex" />n    </Request>n</Product>n</Root>" )), class = "data.frame", row.names = c(NA, -2L))
myframe2 <- structure(list(id = 1:2, xml = c("<Root>n<Product>n    <Test_ID value="1" />n    <Effective_Date value="2022-01-01" />n    <Membership value="Yes" />n    <Request>n      <Request_ID value="1" />n      <Request_type value="Simple" />n    </Request>n    <Request>n      <Request_ID value="2" />n      <Request_type value="Complex" />n    </Request>n</Product>n</Root>", "<Root>n<Product>n    <Test_ID value="1" />n    <Effective_Date value="2022-01-01" />n    <Request>n      <Request_ID value="3" />n      <Request_type value="Simple" />n    </Request>n    <Request>n      <Request_ID value="4" />n      <Request_type value="Complex" />n    </Request>n</Product>n</Root>" )), class = "data.frame", row.names = c(NA, -2L))

上一个答案的函数:

func1 <- function(z) if (is.null(names(z))) attr(z, "value") else lapply(z, func1)
merge.list <- function(A, B) {
# normalize lengths, just in case, since I think you have more than one $Product
A <- lapply(A, `length<-`, max(lengths(A)))
B <- lapply(B, `length<-`, max(lengths(B)))
BnotA <- setdiff(names(B), names(A))
AnotB <- setdiff(names(A), names(B))
inboth <- intersect(names(A), names(B))
A[BnotA] <- replicate(length(BnotA), rep(NA, max(lengths(A))), simplify = FALSE)
A[AnotB] <- lapply(A[AnotB], function(z) c(z, rep(NA, max(lengths(B)))))
A[inboth] <- Map(c, A[inboth], B[inboth])
A
}

处理xmls:的列

intermediate <- lapply(myframe$xml, function(X) xml2::as_list(xml2::read_xml(X)))
final <- lapply(intermediate, function(L) {
do.call(rbind.data.frame, lapply(func1(L$Root), function(pr) {
as.data.frame(lapply(split(pr, names(pr)), function(Y) Reduce(merge.list, Y)))
}))
})
final
# [[1]]
#           Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1     2022-01-01        Yes                  1               Simple       1
# Product.2     2022-01-01        Yes                  2              Complex       1
# [[2]]
#           Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1     2022-01-01        Yes                  3               Simple       1
# Product.2     2022-01-01        Yes                  4              Complex       1

根据结构,您可能可以执行以下操作:

do.call(rbind, final)
#            Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1      2022-01-01        Yes                  1               Simple       1
# Product.2      2022-01-01        Yes                  2              Complex       1
# Product.11     2022-01-01        Yes                  3               Simple       1
# Product.21     2022-01-01        Yes                  4              Complex       1

如果有任何差异(缺少列(,那么您可能需要其他包提供的rbind的变体之一。例如,如果myframe$xml的第二个值没有"Membership"(如上面的myframe2(,则

intermediate2 <- lapply(myframe2$xml, function(X) xml2::as_list(xml2::read_xml(X)))
final2 <- lapply(intermediate2, function(L) {
do.call(rbind.data.frame, lapply(func1(L$Root), function(pr) {
as.data.frame(lapply(split(pr, names(pr)), function(Y) Reduce(merge.list, Y)))
}))
})
final2
# [[1]]
#           Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1     2022-01-01        Yes                  1               Simple       1
# Product.2     2022-01-01        Yes                  2              Complex       1
# [[2]]
#           Effective_Date Request.Request_ID Request.Request_type Test_ID
# Product.1     2022-01-01                  3               Simple       1
# Product.2     2022-01-01                  4              Complex       1

不幸的是

do.call(rbind, final2)
# Error in rbind(deparse.level, ...) : 
#   numbers of columns of arguments do not match

但是我们可以做

dplyr::bind_rows(final2)
#               Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1...1     2022-01-01        Yes                  1               Simple       1
# Product.2...2     2022-01-01        Yes                  2              Complex       1
# Product.1...3     2022-01-01       <NA>                  3               Simple       1
# Product.2...4     2022-01-01       <NA>                  4              Complex       1
data.table::rbindlist(final2, fill = TRUE, use.names = TRUE)
#    Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
#            <char>     <char>             <char>               <char>  <char>
# 1:     2022-01-01        Yes                  1               Simple       1
# 2:     2022-01-01        Yes                  2              Complex       1
# 3:     2022-01-01       <NA>                  3               Simple       1
# 4:     2022-01-01       <NA>                  4              Complex       1

最新更新