R xml parse uniprot features



我想使用 R 从特定的 XML 节点中提取信息:http://www.uniprot.org/uniprot/P38949.xml。它涉及以下块:

    <feature type="initiator methionine" description="Removed" evidence="1">
<location>
  <position position="1"/>
</location>
    </feature>
    <feature type="chain" description="Major pollen allergen Car b 1 isoforms 1A and 1B" id="PRO_0000154185">
<location>
  <begin position="2"/>
  <end position="160"/>
</location>
    </feature>
    <feature type="sequence variant" description="In isoform 1B.">
<original>V</original>
<variation>A</variation>
<location>
  <position position="38"/>
</location>
    </feature>
    <feature type="sequence variant" description="In isoform 1B.">
<original>I</original>
<variation>S</variation>
<location>
  <position position="63"/>
</location>
    </feature>
    <feature type="sequence variant" description="In isoform 1B.">
<original>K</original>
<variation>E</variation>
<location>
  <position position="133"/>
</location>
    </feature>

我尝试使用以下代码提取 type="sequence variant" 的 feature 节点:

    # Download and parse the UniProt XML entry for a single accession.
    # xmlParse/getNodeSet/xpathSApply come from the XML package; no library()
    # call is visible in this snippet.
    TabName = "P38949"
    http = paste0("http://www.uniprot.org/uniprot/",TabName,".xml")
    data = xmlParse(http)
    # Select the entry node(s), binding the prefix "ns" to the URI of the
    # document's default namespace.
    nd = getNodeSet(data, "//ns:entry", namespaces=c(ns=getDefaultNamespace(data)[[1]]$uri))
    # local-name() matches elements by tag name regardless of namespace.
    original = sapply(nd, xpathSApply, './/*[local-name()="original"]', xmlValue)
    variation = sapply(nd, xpathSApply, './/*[local-name()="variation"]', xmlValue)
    # Only this query is restricted to features with type="sequence variant".
    description =  sapply(nd, xpathSApply, './/*[local-name()="feature" and @type="sequence variant"]', xmlGetAttr, 'description')
    # NOTE(review): this matches every <position> element below the entry, not
    # only those inside sequence-variant features, so it can return more values
    # than the three queries above.
    position =  sapply(nd, xpathSApply, './/*[local-name()="position"]', xmlGetAttr, 'position')
    # NOTE(review): `table` is not assigned anywhere in the visible code;
    # presumably an accumulator data frame defined elsewhere -- confirm.
    table = rbind(table, data.frame(description,original,variation,position))

位置(position)属性无法正确提取,因为脚本还提取了 type="initiator methionine" 的 feature 的 position 属性,这不是我想要的。输出表应如下所示:

       description original variation position
    In isoform 1B.        V         A       38  
    In isoform 1B.        I         S       63
    In isoform 1B.        K         E      133

使用 rvest(xml2 的"版本"稍微方便一点)和 purrr:

library(rvest)
# xml2 is attached explicitly: rvest >= 1.0.0 imports xml2 instead of attaching
# it, so read_xml(), xml_attr() and xml_text() are not found with
# library(rvest) alone, and rvest's xml_nodes()/xml_node() are deprecated.
library(xml2)
library(purrr)

# Build one row per <feature type="sequence variant"> element of the entry.
# "d1" is the prefix xml2 assigns to the document's default namespace.
read_xml('http://www.uniprot.org/uniprot/P38949.xml') %>%
    xml_find_all('//d1:feature[@type="sequence variant"]') %>%
    map_df(function(feature) {
        # XPaths are relative to the current feature, so only its own
        # <original>, <variation> and <location>/<position> are read.
        original_node  <- xml_find_first(feature, 'd1:original')
        variation_node <- xml_find_first(feature, 'd1:variation')
        position_node  <- xml_find_first(feature, 'd1:location/d1:position')
        data.frame(
            description = xml_attr(feature, 'description'),
            original    = xml_text(original_node),
            variation   = xml_text(variation_node),
            position    = xml_attr(position_node, 'position'),
            stringsAsFactors = FALSE
        )
    })
#>      description original variation position
#> 1 In isoform 1B.        V         A       38
#> 2 In isoform 1B.        I         S       63
#> 3 In isoform 1B.        K         E      133
您还可以获取序列变体节点,用 flatten 展平 xmlToList 的输出,再将各行绑定成一张表。

library(purrr)
library(dplyr)
# Requires the XML package (getNodeSet, xmlToList) and `data`, the document
# parsed earlier with xmlParse().
# Passing the single prefix "ns" maps it to the document's default namespace.
variants <- getNodeSet(data, "//ns:feature[@type='sequence variant']", "ns")
# xmlToList() returns a nested list per feature; flatten() removes one level so
# each feature becomes a flat named list that bind_rows() can use as a row.
variant_rows <- lapply(variants, function(x) flatten(xmlToList(x)))
# Select columns by name rather than by position, so the result does not
# depend on the order in which children/attributes appear in the XML.
bind_rows(variant_rows)[, c("description", "original", "variation", "position")]
 # A tibble: 3 x 4
      description original variation position
            <chr>    <chr>     <chr>    <chr>
 1 In isoform 1B.        V         A       38
 2 In isoform 1B.        I         S       63
 3 In isoform 1B.        K         E      133

最新更新