使用Python从xml文件中的第二个标记中检索数据



我正在使用simplified_scrapy库解析xml文件。我需要从第二个<xref-list />中检索信息。但是,文件中的每个条目都包含一个或两个<xref-list />,它们引用的是不同的信息。如果有两个<xref-list />,我想从第二个检索信息;如果只有一个<xref-list />,我就只从这一个检索信息。如何仅从第二个检索信息?我试图指定整个层次结构,但它不起作用。我需要为每个xref对象检索database=""和accession=""这两个属性。然后我将数据库代码与我的数据库字典进行匹配,并将登录号(accession)附加到基本url之后。

# Build, for every <cell-line> entry, a dict of its fields plus the list of
# cross-reference URLs.
# NOTE: database_dict must already be defined when this loop runs.
for ele in doc.selects('cell-line'):
    # Copy every field of the entry except 'tag' and 'html'.
    key_values = {field: ele[field] for field in ele if field not in ('tag', 'html')}

    # Cross-reference
    database = ele.selects('Cellosaurus>cell-line>xref-list>xref>database()')
    accession = ele.selects('Cellosaurus>cell-line>xref-list>xref>accession()')
    # Pair each database name with the accession found at the same position.
    cross_ref_dict = {name: accession[pos] for pos, name in enumerate(database)}
    # Only databases that have a known base URL yield a link.
    shared_names = set(cross_ref_dict.keys()).intersection(set(database_dict.keys()))
    key_values['Cross-ref'] = [
        database_dict.get(name) + cross_ref_dict.get(name)
        for name in shared_names
    ]
# Maps a cross-reference database code to the base URL that an accession is
# appended to in order to build the link for that entry.
# Trailing comments note an extra suffix some sites expect after the accession.
database_dict = {
    # The CLO URL must be a single literal: a line break inside the quotes is a
    # syntax error and would also inject whitespace into the URL.
    'CLO': 'https://www.ebi.ac.uk/ols/ontologies/clo/terms?iri=http://purl.obolibrary.org/obo/',
    'EFO': 'https://www.ebi.ac.uk/efo/',
    'ArrayExpress': 'https://www.ebi.ac.uk/arrayexpress/experiments/',
    'ATCC': 'https://www.atcc.org/Products/All/',  # + .aspx
    'BioSample': 'https://www.ncbi.nlm.nih.gov/biosample/?term=',
    'CCLE': 'https://portals.broadinstitute.org/ccle/page?cell_line=',
    'Cell_Model_Passport': 'https://cellmodelpassports.sanger.ac.uk/passports/',
    'ChEMBL-Cells': 'https://www.ebi.ac.uk/chembldb/cell/inspect/',
    'ChEMBL-Targets': 'https://www.ebi.ac.uk/chembldb/target/inspect/',
    'Cosmic': 'https://cancer.sanger.ac.uk/cosmic/sample/overview?id=',
    'Cosmic-CLP': 'https://cancer.sanger.ac.uk/cell_lines/sample/overview?id=',
    'dbMHC': 'https://www.ncbi.nlm.nih.gov/projects/gv/mhc/xslcgi.fcgi?cmd=subj&ID=',
    'ECACC': 'https://www.phe-culturecollections.org.uk/products/celllines/generalcell/detail.jsp?refId=',  # &collection=ecacc_g
    'DepMap': 'https://depmap.org/portal/cell_line/',
    'GDSC': 'https://www.cancerrxgene.org/translation/CellLine/',
    'GEO': 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=',
    'IPD-IMGT/HLA': 'https://www.ebi.ac.uk/cgi-bin/imgt/hla/fetch_cell.cgi?',
    'IARC_TP53': 'https://p53.iarc.fr/CellLines.aspx',
    'IGRhCellID': 'http://igrcid.ibms.sinica.edu.tw/cgi-bin/cell_line_view.cgi?cl_name=',
    'IHW': 'https://www.fredhutch.org/en/labs/clinical/projects/ihwg.html',
    'KCLB': 'https://cellbank.snu.ac.kr/english/sub/catalog.php?page=detail&CatNo=59&strQ=',  # + &submit1=Find+it
    'LiGeA': 'http://hpc-bioinformatics.cineca.it/fusion/cell_line/',
    'LINCS_LDP': 'http://lincsportal.ccs.miami.edu/cells/#/view/',
    'MMRRC': 'https://www.mmrrc.org/catalog/getSDS.php?mmrrc_id=',
    'PharmacoDB': 'https://pharmacodb.ca/cell_lines/',
    'PRIDE': 'https://www.ebi.ac.uk/pride/archive/projects/',
    'RCB': 'http://cellbank.brc.riken.jp/cell_bank/CellInfo/?cellNo=',  # + &lang=En
    'Wikidata': 'https://www.wikidata.org/wiki/',
    'test': 'test',
    #'test2' : 'test2'
}

<Cellosaurus>
<header>
</header>
<cell-line-list>
<cell-line category="Factor-dependent cell line" created="2013-02-11" last_updated="2018-09-07" 
entry_version="8">
<accession-list>
<accession type="primary">CVCL_K248</accession>
</accession-list>
<comment-list>
<comment category="Transfected with">
HGNC; 6554; LEPR
<xref-list>
<xref database="HGNC" category="Organism-specific databases" accession="6554">
<property-list>
<property name="gene/protein designation" value="LEPR"/>
</property-list>
<url><![CDATA[https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:6554]]> 
</url>
</xref>
</xref-list>
</comment>
<comment category="Characteristics"> IL3 dependent </comment>
</comment-list>
<species-list>
<cv-term terminology="NCBI-Taxonomy" accession="10090">Mus musculus</cv-term>
</species-list>>
<reference-list>
<reference resource-internal-ref="Patent=US7524937"/>
</reference-list>
<xref-list>
<xref database="ATCC" category="Cell line collections" accession="CRL-12015">
<property-list>
<property name="Discontinued" value="true"/>
</property-list>
</xref>
<xref database="Wikidata" category="Other" accession="Q54752994">
<url><![CDATA[https://www.wikidata.org/wiki/Q54752994]]></url>
</xref>
</xref-list>
</cell-line>
</cell-line-list>
<Cellosaurus>

据我所知,simplified_scrapy不支持xpath,但lxml支持。因此,如果您将问题中的xml(需先更正:最后的<Cellosaurus>应为结束标记</Cellosaurus>)传递给lxml,就可以很容易地提取该信息。

from lxml import etree
# Placeholder: paste the corrected (well-formed) XML document here.
atcc = """[your xml, corrected]"""
doc = etree.XML(atcc)
# Handle each entry on its own. Inside one <cell-line>, pick the LAST
# <xref-list> in document order: that is the second one when the entry has
# two, and the only one when it has a single list.
for cell_line in doc.xpath('//cell-line'):
    # xref[1] keeps only the first <xref> of that list; use '/xref' instead
    # to visit every cross-reference.
    for atr in cell_line.xpath('(.//xref-list)[last()]/xref[1]'):
        print(atr.attrib['database'])
        print(atr.attrib['accession'])

输出:

ATCC
CRL-12015

最新更新