使用XMLReader分析PHP中复杂的、嵌套的、带前缀的XML节点



几天来,我一直在搜索、阅读和尝试解析我的XML文件,但到目前为止运气不佳。这是我的一个XML文件示例:

<?xml version="1.0" encoding="windows-1252"?>
<?xml-stylesheet type="text/xsl" href="/rss/styles/shared_xsl_stylesheet_v2.xml"?>
<rss version="2.0">
    <channel>
        <title>All XBRL Data Submitted to the SEC for 2014-10</title>
        <link>http://www.sec.gov/spotlight/xbrl/filings-and-feeds.shtml</link>
        <atom:link href="http://www.sec.gov/Archives/edgar/monthly/xbrlrss-2014-10.xml" rel="self" type="application/rss+xml" xmlns:atom="http://www.w3.org/2005/Atom"/>
        <description>This is a list all of the filings containing XBRL for 2014-10</description>
        <language>en-us</language>
        <pubDate>Mon, 27 Oct 2014 00:00:00 EDT</pubDate>
        <lastBuildDate>Mon, 27 Oct 2014 00:00:00 EDT</lastBuildDate>
        <item>
            <title>Bling Marketing, Inc. (0001593549) (Filer)</title>
            <link>http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/0001014897-14-000441-index.htm</link>
            <guid>http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/0001014897-14-000441-xbrl.zip</guid>
            <enclosure url="http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/0001014897-14-000441-xbrl.zip" length="30761" type="application/zip" />
            <description>10-Q</description>
            <pubDate>Mon, 27 Oct 2014 17:25:14 EDT</pubDate>
            <edgar:xbrlFiling xmlns:edgar="http://www.sec.gov/Archives/edgar">
                <edgar:companyName>Bling Marketing, Inc.</edgar:companyName>
                <edgar:formType>10-Q</edgar:formType>
                <edgar:filingDate>10/27/2014</edgar:filingDate>
                <edgar:cikNumber>0001593549</edgar:cikNumber>
                <edgar:accessionNumber>0001014897-14-000441</edgar:accessionNumber>
                <edgar:fileNumber>333-192997</edgar:fileNumber>
                <edgar:acceptanceDatetime>20141027172514</edgar:acceptanceDatetime>
                <edgar:period>20140930</edgar:period>
                <edgar:assistantDirector>2</edgar:assistantDirector>
                <edgar:assignedSic>5094</edgar:assignedSic>
                <edgar:fiscalYearEnd>1231</edgar:fiscalYearEnd>
                <edgar:xbrlFiles>
                    <edgar:xbrlFile edgar:sequence="1" edgar:file="bling10q3q14v2.htm" edgar:type="10-Q" edgar:size="174242" edgar:description="FORM 10-Q" edgar:url="http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/bling10q3q14v2.htm" />
                    <edgar:xbrlFile edgar:sequence="2" edgar:file="bling10q3q14ex31.htm" edgar:type="EX-31" edgar:size="5481" edgar:description="EXHIBIT 31" edgar:url="http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/bling10q3q14ex31.htm" />
                    <edgar:xbrlFile edgar:sequence="3" edgar:file="bling10q3q14ex32.htm" edgar:type="EX-32" edgar:size="1827" edgar:description="EXHIBIT 32" edgar:url="http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/bling10q3q14ex32.htm" />
                    <edgar:xbrlFile edgar:sequence="4" edgar:file="blmi-20140930.xml" edgar:type="EX-101.INS" edgar:size="149179" edgar:description="XBRL INSTANCE DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/blmi-20140930.xml" />
                    <edgar:xbrlFile edgar:sequence="5" edgar:file="blmi-20140930.xsd" edgar:type="EX-101.SCH" edgar:size="28373" edgar:description="XBRL TAXONOMY EXTENSION SCHEMA DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/blmi-20140930.xsd" />
                    <edgar:xbrlFile edgar:sequence="6" edgar:file="blmi-20140930_cal.xml" edgar:type="EX-101.CAL" edgar:size="7021" edgar:description="XBRL TAXONOMY EXTENSION CALCULATION LINKBASE DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/blmi-20140930_cal.xml" />
                    <edgar:xbrlFile edgar:sequence="7" edgar:file="blmi-20140930_def.xml" edgar:type="EX-101.DEF" edgar:size="17205" edgar:description="XBRL TAXONOMY EXTENSION DEFINITION LINKBASE DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/blmi-20140930_def.xml" />
                    <edgar:xbrlFile edgar:sequence="8" edgar:file="blmi-20140930_lab.xml" edgar:type="EX-101.LAB" edgar:size="74477" edgar:description="XBRL TAXONOMY EXTENSION LABEL LINKBASE DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/blmi-20140930_lab.xml" />
                    <edgar:xbrlFile edgar:sequence="9" edgar:file="blmi-20140930_pre.xml" edgar:type="EX-101.PRE" edgar:size="67806" edgar:description="XBRL TAXONOMY EXTENSION PRESENTATION LINKBASE DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/blmi-20140930_pre.xml" />
                </edgar:xbrlFiles>
            </edgar:xbrlFiling>
        </item>
        <item>
            <title>Primco Management Inc. (0001516522) (Filer)</title>
            <link>http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/0001014897-14-000414-index.htm</link>
            <guid>http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/0001014897-14-000414-xbrl.zip</guid>
            <enclosure url="http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/0001014897-14-000414-xbrl.zip" length="80939" type="application/zip" />
            <description>10-K/A</description>
            <pubDate>Tue, 30 Sep 2014 17:41:38 EDT</pubDate>
            <edgar:xbrlFiling xmlns:edgar="http://www.sec.gov/Archives/edgar">
                <edgar:companyName>Primco Management Inc.</edgar:companyName>
                <edgar:formType>10-K/A</edgar:formType>
                <edgar:filingDate>10/01/2014</edgar:filingDate>
                <edgar:cikNumber>0001516522</edgar:cikNumber>
                <edgar:accessionNumber>0001014897-14-000414</edgar:accessionNumber>
                <edgar:fileNumber>000-54930</edgar:fileNumber>
                <edgar:acceptanceDatetime>20140930174138</edgar:acceptanceDatetime>
                <edgar:period>20131231</edgar:period>
                <edgar:assistantDirector>8</edgar:assistantDirector>
                <edgar:assignedSic>6531</edgar:assignedSic>
                <edgar:fiscalYearEnd>1231</edgar:fiscalYearEnd>
                <edgar:xbrlFiles>
                    <edgar:xbrlFile edgar:sequence="1" edgar:file="primco10k13am2v2.htm" edgar:type="10-K/A" edgar:size="482147" edgar:description="FORM 10-K/A" edgar:url="http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/primco10k13am2v2.htm" />
                    <edgar:xbrlFile edgar:sequence="2" edgar:file="primco10k13ex31.htm" edgar:type="EX-31" edgar:size="10412" edgar:description="EXHIBIT 31" edgar:url="http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/primco10k13ex31.htm" />
                    <edgar:xbrlFile edgar:sequence="3" edgar:file="primco10k13ex32.htm" edgar:type="EX-32" edgar:size="3121" edgar:description="EXHIBIT 32" edgar:url="http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/primco10k13ex32.htm" />
                    <edgar:xbrlFile edgar:sequence="4" edgar:file="pmcm-20131231.xml" edgar:type="EX-101.INS" edgar:size="891933" edgar:description="XBRL INSTANCE DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/pmcm-20131231.xml" />
                    <edgar:xbrlFile edgar:sequence="5" edgar:file="pmcm-20131231.xsd" edgar:type="EX-101.SCH" edgar:size="54127" edgar:description="XBRL TAXONOMY EXTENSION SCHEMA DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/pmcm-20131231.xsd" />
                    <edgar:xbrlFile edgar:sequence="6" edgar:file="pmcm-20131231_cal.xml" edgar:type="EX-101.CAL" edgar:size="12529" edgar:description="XBRL TAXONOMY EXTENSION CALCULATION LINKBASE DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/pmcm-20131231_cal.xml" />
                    <edgar:xbrlFile edgar:sequence="7" edgar:file="pmcm-20131231_def.xml" edgar:type="EX-101.DEF" edgar:size="77249" edgar:description="XBRL TAXONOMY EXTENSION DEFINITION LINKBASE DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/pmcm-20131231_def.xml" />
                    <edgar:xbrlFile edgar:sequence="8" edgar:file="pmcm-20131231_lab.xml" edgar:type="EX-101.LAB" edgar:size="146832" edgar:description="XBRL TAXONOMY EXTENSION LABEL LINKBASE DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/pmcm-20131231_lab.xml" />
                    <edgar:xbrlFile edgar:sequence="9" edgar:file="pmcm-20131231_pre.xml" edgar:type="EX-101.PRE" edgar:size="131110" edgar:description="XBRL TAXONOMY EXTENSION PRESENTATION LINKBASE DOCUMENT" edgar:url="http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/pmcm-20131231_pre.xml" />
                </edgar:xbrlFiles>
            </edgar:xbrlFiling>
        </item>
    </channel>
</rss>

XML文件大约为30MB,使用simpleXML解析应该可以,但问题在于simpleXML无法处理像<edgar:formType>这样的标记。我猜它们还不够"简单";-)

因此,我尝试使用XMLReader进行解析,能够解析这些标记。例如:

// Open the feed with XMLReader (no DOMDocument is actually created in this snippet)
$reader = new XMLReader();
$reader->open("file.xml");
$storage = array();
// Move to the first <item> node
while ($reader->read() && $reader->name !== "item");
// Walk every node that follows the first <item> start tag
// NOTE(review): $storage is created once above and is never reset or emitted at an
// </item> boundary, so values read for one <item> are still in it while the next
// <item> is being read.
while ( $reader->read() ) {
    // Ensure that nodeType is an Element and not an Attribute or Text
    if($reader->nodeType == XMLReader::ELEMENT) {
        // Extract and store filing info in $storage array.
        // localName is the element name without its "edgar:" prefix.
        switch ($reader->localName) {
          case "formType":
            // Advance to the node after the start tag (the text node in the
            // sample feed) and copy its value
            $reader->read();
            $storage["formType"] = $reader->value;
            break;
          case "cikNumber":
            // Same pattern; note the value is stored under the key "cik"
            $reader->read();
            $storage["cik"] = $reader->value;
            break;
          default:
            break;
        }       
    // NOTE(review): this dump is outside the switch but inside the ELEMENT check,
    // so it runs once for every element node visited (title, link, guid, each
    // edgar:* tag, ...), not once per <item>. Before the first match it prints an
    // empty array; afterwards it re-prints the same values.
    echo "<pre>"; print_r($storage); echo "</pre>";
    }
}

不过,print_r返回了许多空的和重复的结果。

我想循环遍历每个<item>,并使用MySQLi存储以下数据点:

  • <guid>
  • <edgar:companyName>
  • <edgar:formType>
  • <edgar:filingDate>
  • <edgar:cikNumber>
  • <edgar:accessionNumber>
  • <edgar:period>
  • <edgar:fiscalYearEnd>
  • <edgar:xbrlFile>节点的edgar:url属性(仅限属性edgar:description="XBRL INSTANCE DOCUMENT"的那个节点)

我觉得我已经接近了,因为我已经能够基于localName(即edgar:前缀之后的部分)提取值,但我不知道如何访问属性,也不知道如何将每个<item>的数据存入数据库。

非常感谢这里的帮助,因为我在网上找到的示例都没有显示如何处理这些带前缀的XML标记。提前感谢!

Nick

可以尝试用 $item->children('edgar', true)->... 来访问这些节点。我认为这样您就可以继续使用SimpleXML。edgar:是命名空间前缀,在XML文件中很常见。不久前我遇到过同样的问题,这个方法帮我解决了。

由于您最初尝试使用SimpleXML进行解析,这里给出一个使用SimpleXML解析该XML的解决方案,主要依靠它的xpath方法——它提供了一种非常简单、方便的方式来选择XML文档中的节点。

# Extract the wanted fields from every <item> of the feed and print one array per item.
# Expects $xml to hold the XML document as a string; you will probably be loading
# the XML from a file here rather than a string...
$sxe = simplexml_load_string( $xml );
if ($sxe === false) {
    # malformed input: stop with a clear message instead of failing later with a
    # fatal "Call to a member function xpath()" error
    exit("Could not parse the XML document\n");
}
# tags to read from below edgar:xbrlFiling; hoisted because the list never changes
$tags = array("companyName", "formType", "filingDate", "cikNumber",
    "accessionNumber", "period", "fiscalYearEnd");
# this xpath looks for "item" elements that are under the "channel" element
foreach ($sxe->xpath("channel/item") as $i) {
    # for this example, I'll just store the data and print it after parsing each item
    $data = array();
    # cast the node as a string
    $data['guid'] = (string)$i->guid;
    # register the URI associated with the 'edgar' namespace
    # tags can be referred to using "e:tagName" from now on
    $i->registerXPathNamespace("e", "http://www.sec.gov/Archives/edgar");
    foreach ($tags as $tag) {
        # create the xpath dynamically from the tag name. All tags are under the "item"
        # node ($i) under the parent edgar:xbrlFiling (i.e. e:xbrlFiling)
        $matches = $i->xpath("e:xbrlFiling/e:$tag");
        # an item without this tag yields an empty string rather than an
        # undefined-offset warning
        $data[ $tag ] = empty($matches) ? '' : (string)$matches[0];
    }
    # give every record a 'url' key, even when no instance document is listed;
    # isset($data['url']) is still false in that case
    $data['url'] = null;
    # this searches for e:xbrlFile nodes with description "XBRL INSTANCE DOCUMENT"
    # the final /@e:url returns the e:url attribute, rather than the node itself
    foreach ($i->xpath(
  "e:xbrlFiling/e:xbrlFiles/e:xbrlFile[@e:description='XBRL INSTANCE DOCUMENT']/@e:url"
                           ) as $url) {
        # you may want to use an array here if there are several such URLs;
        # as written, the last matching URL wins
        $data['url'] = (string)$url;
    }
    print_r($data);
}

输出:

Array
(
    [guid] => http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/0001014897-14-000441-xbrl.zip
    [companyName] => Bling Marketing, Inc.
    [formType] => 10-Q
    [filingDate] => 10/27/2014
    [cikNumber] => 0001593549
    [accessionNumber] => 0001014897-14-000441
    [period] => 20140930
    [fiscalYearEnd] => 1231
    [url] => http://www.sec.gov/Archives/edgar/data/1593549/000101489714000441/blmi-20140930.xml
)
Array
(
    [guid] => http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/0001014897-14-000414-xbrl.zip
    [companyName] => Primco Management Inc.
    [formType] => 10-K/A
    [filingDate] => 10/01/2014
    [cikNumber] => 0001516522
    [accessionNumber] => 0001014897-14-000414
    [period] => 20131231
    [fiscalYearEnd] => 1231
    [url] => http://www.sec.gov/Archives/edgar/data/1516522/000101489714000414/pmcm-20131231.xml
)

最新更新