我的TOC脚本不生成严格的html标准代码



我写了一个Perl脚本，用来从HTML页面生成目录（TOC）。它工作得很好，生成的也是有效的HTML，只是Perl的输出省略了某些元素（例如p）的结束标记，因此输出无法通过严格（Strict）DocType的验证。

请向下滚动以查看Perl代码。

我该怎么做才能纠正它?

#!/usr/bin/perl -w
#Copyright anurag gupta ; free to use under GNU GPL License
use strict;
use feature "switch";
use Common;
use HTML::Element;
use HTML::TreeBuilder;
#"F:/anurag/work/indiacustomercare/airtel/recharge.html";
# --- File-level state shared by the subs below and by the driver code ---
# Path of the HTML document; it is checked for readability and parsed below.
my $filename="F:/tmp/t9.html";
# NOTE(review): $index is not referenced anywhere in the visible code.
my $index=0;
# Prefix of every anchor name generated by getLabel().
my $labelprefix="anu555ltg-";
# Counter appended to $labelprefix; getLabel() pre-increments it on each call.
my $tocIndex=100001;
# Accumulates the TOC markup (<ul>, </ul> and <li> fragments) while traversing.
my $toc;
# Heading levels for which hTagEncountered() has opened a nested <ul>
# that is not closed yet.
my @stack;
# Tag name of the most recently processed heading; starts out as "h2".
my $prevHtag="h2";
# hTagEncountered($hTag)
#
# Updates the TOC nesting state for a heading that is about to be listed.
#
#   $hTag - heading tag name such as "h2" or "h3".
#
# Compares the numeric level of $hTag with the level of the previous heading
# ($prevHtag):
#   * deeper    -> the level is pushed on @stack and "<ul>" is appended to $toc;
#   * shallower -> levels greater than the current one are popped from @stack,
#                  appending one "</ul>" to $toc per popped level;
#   * same      -> nothing is emitted.
# Tags matching /h1/ never change the nesting. In every case $prevHtag is set
# to $hTag afterwards.
#
# Plain if/elsif is used instead of given/when, which is an experimental
# feature that warns on modern perls. The former ($) prototype was dropped
# because prototypes do not validate arguments; all callers pass one scalar.
sub hTagEncountered
{
    my $hTag = shift;

    if ($hTag !~ /h1/)
    {
        # "h3" -> 3 : the text following the leading "h" is the level
        my $currLevel = (split /h/, $hTag)[1];
        my $prevLevel = (split /h/, $prevHtag)[1];

        if ($currLevel > $prevLevel)
        {
            # Going deeper: open a nested list and remember its level
            push @stack, $currLevel;
            $toc .= "<ul>";
        }
        elsif ($currLevel < $prevLevel)
        {
            # Going back up: close every nested list deeper than this level
            while (@stack and $currLevel < $stack[-1])
            {
                pop @stack;
                $toc .= "</ul>";
            }
        }
    }

    $prevHtag = $hTag;
    return;
}
# getLabel()
#
# Advances the shared $tocIndex counter by one and returns the anchor name
# made of $labelprefix followed by the new counter value.
sub getLabel
{
    $tocIndex += 1;
    return $labelprefix . $tocIndex;
}
# traversehtml($node)
#
# Walks the tree below $node depth-first. For every element whose tag matches
# h2..h7 it gives the heading a named anchor and appends a matching
# <li><a href="#label">text</a> entry to $toc, calling hTagEncountered() first
# so that nested <ul> elements are opened/closed as the heading level changes.
#
#   $node - an HTML::Element; anything that is not a reference (a text child,
#           or undef) is ignored.
#
# Heading shapes handled:
#   * exactly one text child     -> the text is wrapped in a new <a name=...>;
#   * exactly one <a> child      -> that <a> gets its name attribute set;
#   * more than one child        -> dies;
#   * anything else (empty heading, single non-<a> element) -> not listed.
#
# Text children of an HTML::Element are plain strings, not references, so they
# are recognised with ref() being false. Comparing ref() against "SCALAR"
# never matches a plain string and would let ->tag() be called on text.
sub traversehtml
{
    my $node = $_[0];

    # Text (or nothing at all): no tag and no children to visit
    return unless ref $node;

    if ($node->tag() =~ m/^h[2-7]$/i)  # it's an H element
    {
        my @h = $node->content_list();
        if (@h == 1 and !ref $h[0])  # heading contains a simple string and nothing else
        {
            hTagEncountered($node->tag());
            my $label  = getLabel();
            my $anchor = HTML::Element->new('a', name => $label);
            my $text   = $node->as_trimmed_text();

            # Move the heading text inside the new anchor
            $anchor->push_content($text);
            $node->delete_content();
            $node->push_content($anchor);

            # The tree stores decoded text; the TOC is raw markup, so encode it
            $text = HTML::Entities::encode_entities($text);
            $toc .= <<EOF;
                    <li><a href="#$label">$text</a>
EOF
        }
        elsif (@h == 1 and $h[0]->tag() eq "a")  # <h2><a href="abc.com">ttt</a></h2> case
        {
            # Drop a name generated by an earlier run of this script, if any.
            # \Q..\E keeps the prefix from being interpreted as a pattern.
            my $prevlabel = $h[0]->attr("name");
            $h[0]->attr("name", undef)
                if defined($prevlabel) and $prevlabel =~ m/\Q$labelprefix\E/;

            # Set the new label
            my $label = getLabel();
            $h[0]->attr("name", $label);
            hTagEncountered($node->tag());
            my $text = HTML::Entities::encode_entities($node->as_trimmed_text());
            $toc .= <<EOF;
                <li><a href="#$label">$text</a>
EOF
        }
        elsif (@h > 1)  # <h2>some text here<a href="abc.com">ttt</a></h2> case
        {
            die "h1 must not contain any html elements";
        }
    }

    # Recurse into child elements; plain-text children are skipped
    foreach my $item ($node->content_list())
    {
        traversehtml($item) if ref $item;
    }

    return;
}
   # Driver: parse $filename, build the TOC from its headings, then print the
   # TOC followed by the (modified) children of the document body.
    die "File $filename not found" if !-r $filename;
    my $tree = HTML::TreeBuilder->new();
    # parse_file returns undef (with $! set) when the file cannot be opened
    $tree->parse_file($filename)
        or die "Cannot parse $filename: $!";

    # The second child of the root element is taken to be <body>
    # (HTML::TreeBuilder normally yields <head> followed by <body>).
    my $body = ($tree->content_list())[1];
    die "No body element found in $filename" unless ref $body;

    traversehtml($body);

    # $toc is still undef when no heading was listed
    $toc = '' unless defined $toc;
    # Close every nested list that is still open
    $toc .= "</ul>" x @stack;
    @stack = ();
    $toc = "<ul>$toc</ul>";
    print qq{<div id="icctoc"><h2>TOC</h2>$toc</div>};

    foreach my $child ($body->content_list())
    {
        if (ref $child)
        {
            # An empty hashref as the third argument declares that no end tag
            # is optional, so closing tags such as </p> and </li> are always
            # written.
            print $child->as_HTML(undef, undef, {});
        }
        else
        {
            # Text children are plain decoded strings; encode them so the
            # output stays well-formed markup.
            print HTML::Entities::encode_entities($child);
        }
    }

    # Finally: release the parse tree
    $tree->delete();

尝试把一个空的哈希引用 {} 作为 %optional_end_tags 参数传给 as_HTML，例如：$element->as_HTML(undef, undef, {})。这样就没有任何结束标记被视为可省略，所有元素（包括p）的结束标记都会被输出。

最新更新