将数据从 xml 提取到 R 数据帧



我对 R 中的 XML 和 xml2 包相当陌生,我正在努力将数据从 xml 文件提取到数据帧中。

来自 xml 文件的示例数据

<?xml version="1.0" encoding="utf-8"?>
<mod:ModificationSet xmlns:hci="http://riziv.fgov.be/szv/HealthCareInstitution" xmlns:per="http://riziv.fgov.be/szv/Person" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:pto="http://riziv.fgov.be/szv/PersonToOrganization" xmlns:org="http://riziv.fgov.be/szv/Organization" xmlns:hca="http://riziv.fgov.be/szv/HealthCareAppliance" xmlns:ati="http://riziv.fgov.be/szv/HcApplianceToHcInstitution" xmlns:p12="http://www.w3.org/2001/XMLSchema-instance" xmlns:szv="http://riziv.fgov.be/szv/BasicTypes" xmlns:hcw="http://riziv.fgov.be/szv/HealthCareWorker" xmlns:mod="http://riziv.fgov.be/szv/ModificationSet" xmlns:dev="http://riziv.fgov.be/szv/Device" xmlns:wti="http://riziv.fgov.be/szv/HcWorkerToHcInstitution">
<mod:Payload>
<mod:Modifications>
<mod:Modification>
<mod:Context>
<szv:Origin>63080900</szv:Origin>
<szv:CreationDate>2018-04-05</szv:CreationDate>
<szv:OperationType>01</szv:OperationType>
<szv:OperationDate>2018-04-05</szv:OperationDate>
</mod:Context>
<mod:HealthCareAppliance>
<hca:Identification>
<hca:RizivNumber>00000182</hca:RizivNumber>
</hca:Identification>
<hca:Device>
<dev:DeviceNumber>30016</dev:DeviceNumber>
<dev:DeviceType>PET-CT</dev:DeviceType>
<dev:Model>Philips-Gemini TF Big Bore PET/CT</dev:Model>
<dev:StartDateInvoicing>2016-06-01</dev:StartDateInvoicing>
<dev:EndDateInvoicing p12:nil="true" />
<dev:LocationIsAddress>false</dev:LocationIsAddress>
<dev:IsFixedDevice>true</dev:IsFixedDevice>
<dev:IsExtraMuros>false</dev:IsExtraMuros>
</hca:Device>
</mod:HealthCareAppliance>
</mod:Modification>
<mod:Modification>
<mod:Context>
<szv:Origin>63080900</szv:Origin>
<szv:CreationDate>2018-04-05</szv:CreationDate>
<szv:OperationType>01</szv:OperationType>
<szv:OperationDate>2010-07-13</szv:OperationDate>
</mod:Context>
<mod:HealthCareAppliance>
<hca:Identification>
<hca:RizivNumber>00000182</hca:RizivNumber>
</hca:Identification>
<hca:Status>
<hca:StatusCode>InUse</hca:StatusCode>
<hca:StatusStartDate>2010-07-13</hca:StatusStartDate>
</hca:Status>
</mod:HealthCareAppliance>
</mod:Modification>
<mod:Modification>
<mod:Context>
<szv:Origin>63080900</szv:Origin>
<szv:CreationDate>2018-04-05</szv:CreationDate>
<szv:OperationType>01</szv:OperationType>
<szv:OperationDate>2018-04-05</szv:OperationDate>
</mod:Context>
<mod:HcApplianceToHcInstitution>
<ati:HealthCareInstitution>
<ati:RizivNumber>71024388</ati:RizivNumber>
<ati:InstitutionCode>710</ati:InstitutionCode>
</ati:HealthCareInstitution>
<ati:HealthCareAppliance>
<ati:RizivNumber>00000182</ati:RizivNumber>
</ati:HealthCareAppliance>
<ati:Period>
<szv:StartDate>2016-08-19</szv:StartDate>
<szv:EndDate p12:nil="true" />
</ati:Period>
</mod:HcApplianceToHcInstitution>
</mod:Modification>
</mod:Modifications>
</mod:Payload>
</mod:ModificationSet>

这就是我迄今为止所做的,但没有成功!脚本运行时没有任何报错,却提取不到任何数据,最后返回 NULL。

library(XML)

# Parse the file into an internal (C-level) document so XPath queries can be
# evaluated against it.
xmldoc <- xmlParse("BAS_SIT_HCA_20180405141931.xml", useInternalNodes = TRUE)
class(xmldoc)

# Prefix -> namespace URI map for every prefix used in the XPath expressions
# below. The prefixes must resolve to the same URIs the document declares.
namespace_list <- c(
  mod = "http://riziv.fgov.be/szv/ModificationSet",
  szv = "http://riziv.fgov.be/szv/BasicTypes",
  hca = "http://riziv.fgov.be/szv/HealthCareAppliance",
  dev = "http://riziv.fgov.be/szv/Device",
  ati = "http://riziv.fgov.be/szv/HcApplianceToHcInstitution"
)

# One entry per output column: column name -> XPath relative to a single
# <mod:Modification> node. Keeping the paths in one table replaces the
# repeated "xpN <- ...; value <- ...; if (...) NA" triplets.
field_paths <- c(
  Origin = "./mod:Context/szv:Origin",
  CreationDate = "./mod:Context/szv:CreationDate",
  OperationType = "./mod:Context/szv:OperationType",
  OperationDate = "./mod:Context/szv:OperationDate",
  RizivNumber = "./mod:HealthCareAppliance/hca:Identification/hca:RizivNumber",
  DeviceNumber = "./mod:HealthCareAppliance/hca:Device/dev:DeviceNumber",
  DeviceType = "./mod:HealthCareAppliance/hca:Device/dev:DeviceType",
  DeviceSubType = "./mod:HealthCareAppliance/hca:Device/dev:DeviceSubType",
  Model = "./mod:HealthCareAppliance/hca:Device/dev:Model",
  StartDateInvoicing =
    "./mod:HealthCareAppliance/hca:Device/dev:StartDateInvoicing",
  EndDateInvoicing =
    "./mod:HealthCareAppliance/hca:Device/dev:EndDateInvoicing",
  LocationIsAddress =
    "./mod:HealthCareAppliance/hca:Device/dev:LocationIsAddress",
  IsFixedDevice = "./mod:HealthCareAppliance/hca:Device/dev:IsFixedDevice",
  IsExtraMuros = "./mod:HealthCareAppliance/hca:Device/dev:IsExtraMuros",
  StatusCode = "./mod:HealthCareAppliance/hca:Status/hca:StatusCode",
  StatusStartDate =
    "./mod:HealthCareAppliance/hca:Status/hca:StatusStartDate",
  RizivNumber_ =
    "./mod:HcApplianceToHcInstitution/ati:HealthCareInstitution/ati:RizivNumber",
  InstitutionCode =
    "./mod:HcApplianceToHcInstitution/ati:HealthCareInstitution/ati:InstitutionCode",
  RizivNumber2 =
    "./mod:HcApplianceToHcInstitution/ati:HealthCareAppliance/ati:RizivNumber",
  StartDate = "./mod:HcApplianceToHcInstitution/ati:Period/szv:StartDate",
  EndDate = "./mod:HcApplianceToHcInstitution/ati:Period/szv:EndDate"
)

# Return the text of the first node matching `path` below `node`, or
# NA_character_ when nothing matches.
#
# The emptiness test uses length(), not is.null(): a query with no matches
# yields a zero-length result rather than NULL, so the original is.null()
# guards never fired. Elements that exist but are empty (e.g. xsi:nil
# placeholders) are returned as whatever xmlValue() gives for them.
extract_field <- function(node, path) {
  hits <- xpathSApply(node, path, xmlValue, namespaces = namespace_list)
  if (length(hits) == 0L) {
    return(NA_character_)
  }
  as.character(hits[[1L]])
}

# Turn one <mod:Modification> node into a one-row data frame.
#
# Every field goes through XPath. The original looked the Context fields up
# with node[["./mod:Context/..."]], but `[[` on an XML node selects a child by
# element name and does not evaluate XPath. The function also has to *return*
# the row: the original ended on an `if` statement, so it returned NULL.
modification_to_row <- function(node) {
  values <- vapply(
    field_paths,
    function(path) extract_field(node, path),
    character(1)
  )
  as.data.frame(as.list(values), stringsAsFactors = FALSE)
}

rows <- xpathApply(
  xmldoc,
  "//mod:ModificationSet/mod:Payload/mod:Modifications/mod:Modification",
  modification_to_row,
  namespaces = namespace_list
)

# One row per <mod:Modification>; NULL if the document contains none.
xmldoc_df <- do.call(rbind, rows)
xmldoc_df

提前感谢您的帮助。附言:对之前重复发布问题表示歉意。

这是您希望实现的目标吗?

library(xml2)
library(dplyr)
# Read the document with xml2; prefixes declared in the file can then be used
# directly in the XPath expressions below.
xmldoc <- read_xml("./Desktop/test.xml", encoding = "utf-8", as_html = FALSE)

# Text of every hca:RizivNumber element anywhere in the document.
RizivNumber <- xml_text(xml_find_all(xmldoc, ".//hca:RizivNumber"))
#> RizivNumber
#[1] "00000182" "00000182"

# Text of every dev:DeviceNumber element.
DeviceNumber <- xml_text(xml_find_all(xmldoc, ".//dev:DeviceNumber"))
#> DeviceNumber
#[1] "30016"

# Text of every dev:DeviceType element.
DeviceType <- xml_text(xml_find_all(xmldoc, ".//dev:DeviceType"))
#> DeviceType
#[1] "PET-CT"

……等等。

相关内容

  • 没有找到相关文章

最新更新