这段 Perl 代码处理某个文件夹及其子目录中的 txt 文件,把每个文件拆分成头部(header)、正文(text)和 xml 三个输出文件。
#!perl -w
use strict;
use utf8;
use File::Copy;
use File::Basename;
# --- Package-level state shared by the subs below ---
# Directories collected by get_files(); main() iterates over this list.
our @folders=();
# Not referenced anywhere else in the visible code.
our %errors=();
# Text that main() would write to a report file. Nothing visible appends to
# it (the code that did so in go_go_gadget() is commented out).
our $page_errors='';
# Output directory; main() derives it from the current input directory.
our $folder_out='';
# Input directory currently being processed (loop variable in main()).
our $folder_in='';
# Output sub-folder name that go_go_gadget() derives from the file name.
our $sub_folder="";
# Developer-mode flag: only when it is 0 do header() and go_go_gadget()
# insert $sub_folder into the output path.
our $dev=0;
# Not referenced anywhere else in the visible code.
our $anker='#a_';
# Set by check_linebreak(): 1 when the line it just handled contained a coded
# ("word/word") line break, 0 otherwise.
our $coded_lb=0;
# Running line counter used for line-break numbering; incremented by
# check_linebreak() and reset in go_go_gadget().
our $line_cnt=0;
# Look up the complete file name that belongs to an abbreviated one.
#
# The abbreviated name is split into a non-digit prefix and the run of digits
# that follows it. The list file is then scanned for the first line that
# contains the prefix, one or more characters from [a-zA-Z_-], and the number.
# Empty lines and lines containing "Datei" are ignored.
#
# Parameters:
#   $_[0]  abbreviated file name
#   $_[1]  optional path of the list file
#          (default: ..\..\complete_filenames.txt)
# Returns the first matching line, or the abbreviated name unchanged when it
# does not consist of prefix + number, when the list file cannot be opened
# (a warning is issued), or when no line matches.
sub get_complete_filename
{
    my ($short_name, $list_file) = @_;
    return $short_name if (!defined $short_name);
    $list_file = "..\\..\\complete_filenames.txt" if (!defined $list_file);

    # Decide on the result of the match itself: testing $1/$2 for truth would
    # reject a number of "0" and could see captures left over from an earlier
    # successful match.
    my ($name, $number) = $short_name =~ m/(\D+)(\d+)/;
    return $short_name if (!defined $number);

    my $fh;
    if (!open($fh, "<:encoding(utf-8)", $list_file))
    {
        warn "cannot open $list_file: $!\n";
        return $short_name;
    }

    my $complete = $short_name;
    while (my $line = <$fh>)
    {
        chomp($line);
        next if ($line eq '' || $line =~ m/Datei/);
        # \Q...\E: the name parts are literal text, not patterns; it also
        # keeps "$name[" from being read as an array subscript.
        if ($line =~ m/\Q$name\E[a-zA-Z_-]+\Q$number\E/)
        {
            $complete = $line;
            last;
        }
    }
    close $fh;
    return $complete;
}
# Recursively collect every sub-directory below a directory.
#
# Each directory found is appended to the package-level @folders list, a
# parent always before its own children. Nothing is returned.
#
# Parameters:
#   $_[0]  directory to scan; entry names are joined to it with a backslash
# Dies when a directory cannot be opened.
sub get_files
{
    my ($dir) = @_;

    # A lexical handle gives every recursion level its own directory handle.
    opendir(my $dh, $dir) || die "Unable to open $dir: $!";
    # Skip exactly "." and ".."; the dots must be escaped, otherwise every
    # one- or two-character entry name would be dropped as well.
    my @entries = grep { !/^\.\.?$/ } readdir($dh);
    closedir($dh);

    foreach my $entry (@entries)
    {
        my $path = "$dir\\$entry";
        if (-d $path)
        {
            push(@folders, $path);
            get_files($path);
        }
    }
    return;
}
# Write the TEI header file "<name>.teih" into the output folder and return
# the XML declaration line.
#
# If "<folder_in>\<name>.teih" exists, its lines from the one containing
# <teiHeader> up to and including the one containing </teiHeader> are copied.
# Otherwise an empty header template is written, preceded by an XML comment
# asking for its content to be filled in.
#
# The target is "<folder_out>\<name>.teih", or
# "<folder_out>\<sub_folder>\<name>.teih" when the package variable $dev is 0.
#
# Parameters:
#   $_[0]  base file name without extension
#   $_[1]  input folder
#   $_[2]  output folder
# Returns the string '<?xml version="1.0" encoding="utf-8"?>'.
# Dies when a file cannot be opened or the output cannot be closed, in line
# with the way go_go_gadget() treats its own input and output files.
sub header
{
    my ($fn, $folder_in, $folder_out) = @_;
    my $source = $folder_in."\\".$fn.".teih";
    my $target = $folder_out.($dev eq 0 ? "\\".$sub_folder : "")."\\".$fn.".teih";
    my $output;

    if (-e $source)
    {
        open(my $in, "<:encoding(utf-8)", $source) || die "cannot read $source: $!\n";
        my $input = do { local $/; <$in> };
        close $in;

        $output = "";
        my $read = 0;
        foreach my $line (split(/[\n\r]/, $input))
        {
            if ($line =~ m/<\/teiHeader>/)
            {
                # closing line: keep it and stop
                $output .= $line."\n";
                last;
            }
            elsif ($read || $line =~ m/<teiHeader>/)
            {
                $read = 1;
                $output .= $line."\n";
            }
        }
    }
    else
    {
        $output = "\n\t<!--\n\t copy of the main_header or empty header!!!\n\t please update its content\n\t-->\n".
'<teiHeader>
<fileDesc>
<titleStmt>
<title/>
<respStmt>
<resp/>
<name/>
</respStmt>
</titleStmt>
<publicationStmt>
<distributor/>
</publicationStmt>
<sourceDesc>
<bibl/>
</sourceDesc>
</fileDesc>
</teiHeader>';
    }

    open(my $out, ">:encoding(utf-8)", $target) || die "cannot write $target: $!\n";
    print {$out} $output;
    # Buffered write errors only show up when the handle is closed.
    close($out) || die "cannot close $target: $!\n";
    return '<?xml version="1.0" encoding="utf-8"?>';
}
# Prefix one text line with its line-break placeholder and encode a coded
# ("word/word") line break inside it.
#
# The package counter $line_cnt is incremented on every call. If the line
# contains two runs of Georgian letters/hyphens separated by "/", the first
# such "/" is replaced by a TRENNENDERZB placeholder that carries the number
# of the following line, and a blank is appended to the line. Unless the
# previous call saw such a coded break ($coded_lb), the line is prefixed with
# a NORMALERZB placeholder carrying the current line number. Both placeholders
# are turned into <lb n="..."/> elements by end_verarbeitung().
#
# Parameters:
#   $_[0]  the text line
#   $_[1]  pending page-break placeholder text that is put in front of it
# Returns the assembled string; updates $line_cnt and $coded_lb.
sub check_linebreak
{
    my ($line, $pg) = @_;
    $line_cnt++;

    my $has_coded_break = 0;
    if ($line =~ m/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ-]+)\/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ-]+[[:punct:]]*)/)
    {
        my ($before, $after) = ($1, $2);
        my $marker = "TRENNENDERZBTRENNENDERZB".($line_cnt+1)."TRENNENDERZBTRENNENDERZB";
        # \Q...\E: the captured text may end in punctuation and must be
        # matched literally.
        $line =~ s/\Q$before\E\/\Q$after\E/$before$marker$after/;
        $line .= " ";
        $has_coded_break = 1;
    }

    my $ret = $pg;
    # The line number was already emitted by the previous line's coded break.
    $ret .= "NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB " if ($coded_lb eq 0);
    $ret .= $line;

    $coded_lb = $has_coded_break;
    return $ret;
}
# First normalisation pass over the raw text of one input file.
#
# Takes the file content as a single string ($_[0]), runs it through the
# fixed sequence of substitutions below and returns the result. Roughly in
# order, the substitutions deal with:
#   - the byte-order mark and the "#" / "|" / p / c / t structural markers,
#   - quotation marks next to Georgian letters (rewritten to » and «),
#   - malformed single-letter tags (a, t, h, z, e) and <pol> tags,
#   - bracketed passages, which become <unclear> and <corr> elements, and
#     footnote markers, which become <ref> and <note> elements,
#   - a group of Georgian letter replacements and "#" separator lines.
# Later patterns work on the output of earlier ones, so the order matters.
#
# NOTE(review): in this copy the patterns contain no backslashes at all
# ("d+" where "\d+" would be expected, "n" where "\n" would be expected,
# unescaped "/" inside s/// patterns). It looks as if the escapes were lost
# when the code was pasted - verify every pattern against the original script.
sub anfangs_verarbeitung
{
my $tmp=$_[0];
$tmp =~ s/^x{FEFF}//; # removes BOM
# structural markers
$tmp =~ s/#(?:(?:rn)|n|r)+(-{2,})/#$1n/mg;
$tmp =~ s/^ps*$//g;
$tmp =~ s/^s*([pP]d+)s*([cC]s*[0-9IVX]+)/$1n$2/g;
$tmp =~ s/(?<=#)|(?=[pppctPCT])//g;
$tmp =~ s/|(?=[pppctPCT])/#/g;
$tmp =~ s/(?<![|#])([pppcPC]s*[dIVXMC]+)+/#$1/g;
$tmp =~ s/|(d+)/#p$1/g;
# quotation marks next to Georgian letters
$tmp =~ s/«(?=[აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])/»/g;
$tmp =~ s/»(?![აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])/«/g;
$tmp =~ s/<<(?=[აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])/»/g;
$tmp =~ s/>>(?![აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])/«/g;
$tmp =~ s/(?:„|,,|")([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]+)«/»$1«/g;
$tmp =~ s/»([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]+)[“"']/»$1«/g;
$tmp =~ s/^(d{4}s*წ.)s*$/#d $1/g;
$tmp =~ s/<pol>/<pol>/g;
$tmp =~ s/<ა>/<a>/g;
$tmp =~ s/<?([athzee])>/</$1>/g; #<?a>
$tmp =~ s/[<>]/([athzee])(?![<>])/</$1>/g; #</a >/a
$tmp =~ s/<([athzee])/>/</$1>/g; #<a/>
$tmp =~ s/[<>]/([athzee])[<>]/</$1>/g; #>/a> etc.
$tmp =~ s/[<>]([athzee])[<>]/<$1>/g; #>a< etc.
$tmp =~ s/<([athzee])(?![<>])/<$1>/g; #<a >a
$tmp =~ s/(?<=[^></#])([athzee])[<>]/<$1>/g; #a< a>
$tmp =~ s/(?<=[^><])/([athzee])[<>]/</$1>/g; #/a< /a>
#$tmp =~ s/</([athze])>([^<]+)</[^1]>/<$1>$2</$1>/gm;
##$tmp =~s/<([athz])>([^<]+)</[^(?:$1)]>/<$1>$2</$1>/g;
##$tmp =~s/<([athz])>([^<]+)<[^(?:1)]>/<$1>$2</$1>/g;
$tmp =~ s/<([pol])>([^<]+)</1>-<1>([^<]+)</1>/<$1>$2-$3</$1>/g;
#$tmp =~ s/<([athze])>([^<]+)</1>[--]<([athze])>([^<]+)</3>/<$1>$2-$4</$1>/g;
##$tmp =~ s/([^s]+)-<([athz])>([^<]+)</2>/<$2>$1-$3</$2>/g;
##$tmp =~ s/<name([^>]+)>([^<]+)</name>//g;
$tmp =~ s/</</</g;
####$tmp =~ s/</(?![athzee])//g;
$tmp =~ s/#{2,}/#/g;
$tmp =~ s/(//?([^)]*))/<unclear>$1</unclear>/gm;
$tmp =~ s/<unclear></unclear>/<unclear/>/g;
$tmp =~ s/(//([^)]+))/<corr>$1</corr>/gm;
#$tmp =~ s/<s(d+)>([^<]+)</s1>/$2<ref target="#a$1" type="noteAnchor">$1</ref>/gm;
$tmp =~ s/<[sS](d+)>/<ref target="a$1" type="noteAnchor">/g;
$tmp =~ s/</[sS]d+>/</ref>/g;
$tmp =~ s/([sS](d+)=?s*([^)]+))/<note xml:id="a$1" type="footnote">$2</note>n/gm;
#$tmp =~ s/#f(d+)s*(.*)([^#|]+)/<note xml:id="a$1" type="footnote">$2</note>nn/gm;
#$tmp =~ s/(ss*(d+)s*([^)]+))/<note xml:id="a$1" type="footnote">$2</note>nn/gm;
$tmp =~ s/n{1,}</note>/</note>/gm;
#$tmp =~ s/s*#-{2,}//gm;
$tmp=~s/ვი$/ჳ/g;
$tmp=~s/ხ$/ჴ/g;
$tmp=~s/ე$/ჱ/g;
$tmp=~s/ი$/ჲ/g;
$tmp=~s/ფ$/ჶ/g;
$tmp=~s/ვ$/უ/g;
$tmp=~s/ო$/ჵ/g;
$tmp=~s/#.{2,}/#--------------/g;
return $tmp;
}
# Final clean-up pass over the assembled XML body.
#
# Parameters:
#   $_[0]  the text to clean up
#   $_[1]  a name that is matched against a few month.year patterns and
#          written into the report file (go_go_gadget() passes the output
#          base name)
#   $_[2]  folder in which the report "000_sperrschrift.txt" is appended to
# Returns the cleaned-up text.
#
# What the substitutions do, in broad strokes:
#   - tidy whitespace and remove empty <p>, <div> and <head> elements,
#   - turn the short tags (<pol>, <t>, <z>, <h>, <e>, <u>) into
#     <term type="political"> and <name type="..."> elements,
#   - normalise punctuation spacing, dot sequences/ellipses and dashes,
#   - replace the NORMALERZB..., TRENNENDERZB... and PAGE... placeholders with
#     <lb n="..."/> and <pb n="..."/> elements,
#   - unless $_[1] matches one of the listed month.year patterns, wrap
#     letter-spaced words (Georgian letters each followed by one other
#     character, at least three times in a row) in <hi rend="letter-spacing">
#     and append them to the report file. The report file is opened without
#     checking whether the open succeeded.
#
# NOTE(review): in this copy the patterns contain no backslashes at all
# ("s+" where "\s+" would be expected, "n"/"t" where "\n"/"\t" would be
# expected, unescaped "/" inside s/// patterns). It looks as if the escapes
# were lost when the code was pasted - verify every pattern against the
# original script.
sub end_verarbeitung
{
my $tmp=$_[0];
$tmp =~ s/[nr]{2,}/n/g;
$tmp =~ s/<p>s+/<p>/g;
$tmp =~ s/</p>s+/</p>/g;
$tmp =~ s/<p></p>//g;
$tmp =~ s/<div><p><div type="dateline">/<div type="dateline">/g;
$tmp =~ s/<p><div type="dateline">/<div type="dateline">/g;
$tmp =~ s/<pol>([^<]+)</pol>/<term type="political">$1</term>/g;
$tmp =~ s/<term type="political"> ([^<]+)</name>/ <term type="political">$1</term>/g;
$tmp =~ s/<a><name/<name/g;
$tmp =~ s/<t>([^<]+)</t>/<name type="toponym">$1</name>/g;
$tmp =~ s/<z>([^<]+)</z>/<name type="zoonym">$1</name>/g;
$tmp =~ s/<h>([^<]+)</h>/<name type="hydronym">$1</name>/g;
$tmp =~ s/<e>([^<]+)</e>/<name type="ethnonym">$1</name>/g;
#$tmp =~ s/<a>([^<]+)/<name type="anthroponym">$1</name>/g;
#$tmp =~ s/([^>]+)</a>/<name type="anthroponym">$1</name>/g;
$tmp =~ s/<u>([^<]+)<?/u>/<name type="unknown">$1</name>/g;
$tmp =~ s/s+([.:,!?)])/$1/g;
$tmp =~ s/(()s+/$1/g;
$tmp=~s/<p>#</p>//g;
$tmp=~s/<div></div>//g;
$tmp=~s/.s+./../g;
$tmp=~s/..(?!<.)/.../g;
$tmp=~s/.../…/g;
$tmp=~s/…s*./…/g;
$tmp=~s/ +([,.…;:!?])/$1/g;
#$tmp=~s/([,.…;:!?])(?!< )/$1 /g;
$tmp=~s/-/–/g;
$tmp=~s/,–/, –/g;
$tmp=~s/([.:,!?)])–/$1 -/g;
$tmp=~s/. </.</g;
$tmp=~s/xml: id/xml:id/g;
$tmp=~s/#-{2,}//g;
$tmp=~s/<p></p>//g;
$tmp=~s/s*</p><p>/</p>ntttt<p>/g;
$tmp=~s/ +/ /g;
#$tmp =~ s/„([^„“]+)„/„$1“/g;
#$tmp=~s/<pb n="(d+)"/>(?:rn)*n*s*</div>/<pb n="$1"/>/gm;
#$tmp=~s/<div type="Section">(?:rn)*n*s*<head>([^<]+)</head>/<div type="Section">ntttt<head>$1</head>ntttt</div>/gm;
#$tmp=~s/s*<pb n="(d+)"/>(?:rn)*n*s*<div type="Section">/<div type="Section">ntttt<pb n="$1"/>/gm;
$tmp=~s/</p><lg>/</p>ntttt<lg>/g;
$tmp=~s/</p></div>/</p>nttt</div>/g;
$tmp=~s/(<name[^>]*>) +/ $1/g;
$tmp=~s/([^აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]*) +</name>/</name>$1 /g;
$tmp=~s/…s*</name>/</name>…/g;
$tmp=~s/,s*././g;
$tmp=~s/ +/ /g;
$tmp=~s/NORMALERZBNORMALERZB(d+)NORMALERZBNORMALERZB/nttttt<lb n="$1"/> /g;
$tmp=~s/TRENNENDERZBTRENNENDERZB(d+)TRENNENDERZBTRENNENDERZB/<lb n="$1"/>/g;
$tmp=~s/PAGE PAGE PAGE PAGE PAGE(d+)PAGE PAGE PAGE PAGE PAGE/<pb n="$1"/>/g; #//<pb n="".$current_page.""/>";
$tmp=~s/</p>(<pb n="d+"/>)/</p>ntttt$1/g;
$tmp=~s/ (<pb n="d+"/>)/$1/g;
$tmp=~s/</p>[rn]+s+<p>(<pb n="d+"/>)</p>/$1</p>/g;
$tmp=~s/</l>(<pb n="d+"/>)/$1</l>/g;
$tmp=~s/ +/ /g;
$tmp=~s/<a><name/<name/g;
$tmp=~s/<head></head>//;
my $sperr="";
if ($_[1]!~m/(?:04|07|11).1857/ && $_[1]!~m/(?:04|08).1858/)
{
while ($tmp=~m/(?<![აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])((?:[აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ][^აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ~–-]){3,})/)
{
my $sperr_org=$1;
my $sperr_edit=$1;
my $rest="";
$sperr_edit=~s/ //g;
$sperr.=$sperr_edit."n";
if ($sperr_edit=~m/([^აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]+)$/)
{
$rest=($1 ne "<"?" ":"").$1;
$sperr_edit=~s/[^აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]+$//;
}
else { $rest=" "; }
$tmp=~s/(?<![აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])Q$sperr_orgE/<hi rend="letter-spacing">$sperr_edit</hi>$rest/;
}
if ($sperr ne "")
{
open (OUT,">>:encoding(utf-8)", $_[2]."\000_sperrschrift.txt");
print OUT $_[1]."nt".$sperr;
close OUT;
}
}
$tmp=~s/>([^<])</name>./>$1.</name>/g;
return $tmp;
}
# Convert one input .txt file into a TEI wrapper file (.xml), a text body
# file (.txml) and, through header(), a header file (.teih).
#
# Parameter:
#   $_[0]  path of the input file
# Uses the package variables $folder_in, $folder_out, $dev, $sub_folder,
# $line_cnt and $coded_lb. Files whose name contains "instruqcia" are skipped.
# Dies when the input file or the .txml output file cannot be opened; the
# open of the .xml wrapper file is not checked.
#
# Outline:
#   1. derive the output base name ($fn) and $sub_folder from the file name,
#   2. slurp the file and pre-process it with anfangs_verarbeitung(),
#   3. walk the lines and build $output; each line is classified as chapter,
#      page break, title, verse, paragraph separator, dateline or ordinary
#      text, while $p / $v / $div / $chapter / $ut record which elements are
#      currently open,
#   4. close whatever is still open and post-process with end_verarbeitung(),
#   5. write "<fn>.xml" (a DOCTYPE declaring entities for the .teih and .txml
#      files) and "<fn>.txml" (the string returned by header() followed by the
#      <text> element).
#
# NOTE(review): in this copy the string and pattern escapes are missing
# ("n"/"t" where "\n"/"\t" would be expected, "\" where "\\" would be
# expected, unescaped "/" and quotes). It looks as if the backslashes were
# lost when the code was pasted - verify against the original script.
sub go_go_gadget
{
my $file_xml=$_[0];
my $file_html=''; my $output=''; my $output_filename='';
my $chapter=0; my $div=0; my $p=0; my $last_p=0; my $v=0;
my $input_xml=''; my $chapter_type=''; my $written=0;
my $page=0; my $started=0;
(my $fn,my $pn)=fileparse $file_xml;
return if ($fn=~m/instruqcia/);
print "tkonvertiere $fnn";
# derive the output base name and the sub-folder from the file name
$fn=~s/.txt//g;
$fn=~s/(d+)_/$1+/;
$fn=~s/_/-/g; $fn=~s/^([a-z]+)-/$1_/g;
$sub_folder="";
if ($fn=~m/^([^_]+_[a-zA-Z]+)/)
{
$sub_folder=$1;
}
$file_xml=~s/(?:/|\+)/\/g;
open (IN,"<:encoding(utf-8)", $file_xml) || die "konnte die datei nicht oeffnen: $!n";
$input_xml = do { local $/; <IN> } ; # read the whole input file into one string
close IN;
# -----------------------------------------
$input_xml=anfangs_verarbeitung($input_xml);
# -----------------------------------------
$div=0;
my $last_line='';
my @lines=split(/n/,$input_xml);
$line_cnt=0;
my $group_cnt=0;
my $verse_cnt=0;
my $pg='';
my $first_page=0;
my $last_page=0;
my $has_chapters=0;
my $ut=0;
my $quote_open=0;
my $section_cnt=0;
my $chapter_cnt=0;
$coded_lb=0;
$chapter_type="Section";
$has_chapters=1 if ($input_xml=~m/#s*[cC]s*[dIVXMC]+[–-]?[dIVXMC]*/);
if ($has_chapters eq 0)
{
$output='<div type="Section">' ;
#$chapter=1;
}
# NOTE(review): this unconditional assignment overwrites the value set in the
# block just above, so that block has no effect.
$output='<div type="Content" n="1">'."n";
foreach my $line (@lines)
{
$line=~s/^Ls*//;
chomp($line); $line=~s/n//g; $line=~s/r//g; $line=~s/(s){2,}/$1/g; $line =~ s/^s+//g; $line =~ s/s+$//g;
#$line=~s/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])([,;.])([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ])/$1$2 $3/g;
if ($has_chapters eq 1 && $line =~ m/^s*#?s*|?[cC]s*(d+)s*(.*)/) # chapter
{
$output.="</note>" if ($ut == 1);
if ($p eq 1) { $output.="</p>"; }
elsif ($v eq 1) { $output.="ntttt</lg>"; }
if ($started eq 1)
{
if ($div eq 1) { $output.="nttt</div>"; }
elsif ($chapter eq 1) { $output.="nttt</div>"; }
}
my $title=$2;
if ($title) { $title=~s/<ref target="#a(d+)" type="noteAnchor">/<ref target="#a_$page_$1" type="noteAnchor">/g; }
$chapter_cnt=$1;
$output.="nttt".'<div type="Chapter" n="'.$chapter_cnt.'">'."ntttt<head>".($title?check_linebreak($title,$pg):$pg)."</head>";
$chapter=1; $chapter_type="Chapter"; #$div=0;
$p=0; $written=0; $v=0; $ut=0;
$last_line=""; $started=0;
$pg='';
}
elsif ($line =~ m/^s*#s*[pP]s*(d+)/) # page break
{
if ($v ne 1 && $p eq 0)
{
if ($div == 0)
{
if ($chapter_type eq "Section" || $has_chapters == 0) { $section_cnt++; $output.='<div type="Section" n="'.$section_cnt.'">'; }
else { $output.='<div type="Chapter" n="'.$chapter_cnt.'">'; }
$div=1;
}
$output.="<p>"; $p=1;
}
#$output.="<pb n="".$1.""/>";
# --- detecting page errors
my $current_page=$1; #0;
#if ($first_page > 0)
#{
# $current_page=$1;
# if ($current_page-$last_page<1)
# {
# $current_page=$last_page+1;
# $page_errors.=$fn."t".$last_page."n";
# }
# elsif ($current_page-$last_page>1)
# {
# $page_errors.=$fn."t".$last_page."n";
# }
#}
#else
#{
# $first_page=$1;
# $current_page=$1;
#}
#$last_page=$current_page;
# ----
$pg.="PAGE PAGE PAGE PAGE PAGE".$current_page."PAGE PAGE PAGE PAGE PAGE";
#$p=0;
$page=$1;
#$written=0;
$last_line="";
$line_cnt=0;
}
elsif ($line =~ m/s*#[tT]s*(.+)/) # title
{
$output.="</note>" if ($ut == 1);
if ($p eq 1) { $output.="</p>"; }
elsif ($v eq 1) { $output.="ntttt</lg>"; }
if (($chapter eq 1 || $div eq 1) && $chapter_type ne 'chapter')
{
# if($chapter_type eq 'chapter')
# {
# if ($started eq 1)
# { $output.="nttt</div>nttt".'<div type="Chapter" n="'.$1.'">';}
# else { $output.='<div type="Chapter" n="'.$1.'">';}
# }
# else
# {
if ($started eq 1) { $section_cnt++; $output.="nttt</div>nttt".'<div type="Section" n="'.$section_cnt.'">'; }
else { $section_cnt++; $output.="nttt".'<div type="Section" n="'.$section_cnt.'">'; }
# }
}
else
{
#$section_cnt++;
#$output.='<div type="Section" n="'.$section_cnt.'">';
#$div=1;
}
#$line_cnt++;
$output.="ntttt<head>".$pg.$1."</head>";
$pg='';
$p=0; $written=0; $v=0; $ut=0;
$last_line=''; $started=1;
}
elsif ($line =~ m/#vs*(.+)/) # verse
{
$output.="</note>" if ($ut == 1);
if ($p eq 1) { $output.="</p>" ;}
if ($v eq 0) { $group_cnt++; $verse_cnt=0; $output.="ntttt".'<lg n="'.$group_cnt.'">'; }
$verse_cnt++;
$last_line=$1;
$line_cnt++;
$output.="nttttt".'<l n="'.$verse_cnt.'">'.$pg."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB".$1."</l>";
$p=0;
$written=1; $v=1;
$started=1; $pg=''; $ut=0;
}
elsif ($line=~m/#s*-{2,}/) #elsif ($line eq '' && $last_line ne '') # && $last_line!~m/[.!?]s*$/) # paragraph
{
if ($written eq 1)
{
$output.="</note>" if ($ut == 1);
if ($p eq 1) { $output.="</p>"; $p=0; }
elsif ($v eq 1 && $written eq 0) { $output.="nttt</lg>ntttt"; $v=0; }
}
#if ($p eq 0 && $v eq 0) { $output.="ntttt<p>"; $p=1; }
$written=0; $last_line=''; $ut=0;
}
elsif ($line =~ m/^(?:#d)?s*(d{4}s*წ.)$/ || $line=~m/^s*(d{4}(?: – d+s*წ*.)?s*)$/ || $line=~m/^s*([0-9]+s*[აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]+s*[0-9]+s*[აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ]*)$/) # dateline
{
if ($div eq 1 || $chapter eq 1)
{
$output.="</note>" if ($ut == 1);
if ($p eq 1) { $output.="</p>"; }
elsif ($v eq 1) { $output.="ntttt</lg>"; }
$output.="nttt</div>";
$chapter=0; $div=0; $ut=0;
}
$line_cnt++;
$output.="ntttt<div type="dateline"><p>".$pg."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB".$1."</p></div>";
$p=0; $written=0; $v=0; $pg='';
$last_line="";
}
elsif ($div eq 1 || $chapter eq 1 || ($div eq 0 && $chapter eq 0))
{
if ($line!~m/^s*$/)
{
$output.="ntttt</lg>" if ($v eq 1);
if ($div eq 0 && $chapter eq 0) { $div=1; $section_cnt++; $output.="nttt".'<div type="Section" n="'.$section_cnt.'">';}
if ($p eq 0) { $output.="ntttt<p>"; }
$line=~s/s*#s*//g;
# --- quotes
$line=~s/([.,;?!:])„/$1“/g; #„ “
if ($line=~m/^s*„/ && $line!~m/“/ && $line=~m/[.?!:]+s*$/)
{
$line.="“";
}
else
{
$line=~s/„//g;
}
$line=~s/“//g if ($line=~m/“/ && $line!~m/„/);
# ---
# --- ref
$line=~s/<ref target="a(d+)" type="noteAnchor">/<ref target="#a_$page_$1" type="noteAnchor">/g;
$line=~s/<note xml:id="a(d+)" type="footnote">/<note xml:id="a_$page_$1" type="footnote">/g;
# ---
if ($line=~m/|ut/)
{
$line=~s/|ut/<note type="comment">/;
$ut=1;
}
$output.=check_linebreak($line,$pg);
#$line_cnt++;
#if ($line=~m/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ-]+)/([აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶ-]+[[:punct:]]*)/)
#{
# my $tmp1=$1; my $tmp2=$2;
# my $z="TRENNENDERZBTRENNENDERZB".($line_cnt+1)."TRENNENDERZBTRENNENDERZB";
# $line=~s/Q$tmp1E/Q$tmp2E/$tmp1$z$tmp2/; $line.=" ";
# if ($coded_lb eq 0)
# {
# $output.=$pg."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB ".$line;
# }
# else
# {
# $output.=$pg.$line;
# }
# $coded_lb=1;
#}
#else
#{
# if ($coded_lb eq 0)
# {
# $output.=$pg."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB ".$line;
# }
# else
# {
# $output.=$pg.$line;
# }
# $coded_lb=0;
#}
#$output.=$pg." "."NORMALERZBNORMALERZB".$line_cnt."NORMALERZBNORMALERZB".$line;
$last_line=$line;
$p=1; $written=1; $v=0; $started=1;$pg='';
}
}
}
if ($p eq 1) { $output.="</p>";}
elsif ($v eq 1) { $output.="ntttt</lg>"; }
if ($div eq 1) { $output.="nttt</div>"; }
elsif ($chapter eq 1) { $output.="nttt</div>"; }
# -----------------------------------------
$output=end_verarbeitung($output,$fn,$folder_out)."</div>";
# -----------------------------------------
#$fn=get_complete_filename($fn);
mkdir($folder_out."\".$sub_folder,0777) if ($dev eq 0 && !(-d $folder_out."\".$sub_folder));
my $txt='<text rend="Section" xml:lang="kat">';
$txt='<text rend="'.($section_cnt?"Section ":"").'Chapter" xml:lang="kat">' if ($has_chapters eq 1);
$output=header($fn,$folder_in,$folder_out)."nt".$txt.'
<body>
'.$output.'
</body>
</text>';
$output=~s/(<body>(?:rn)*s*<pb n="d+"/>)(?:rn)*s*</div>/$1/g;
$output_filename=$folder_out.($dev eq 0 ? "\".$sub_folder : "")."\".$fn.".xml";
open (OUT, ">:encoding(utf-8)", $output_filename);
print OUT '<?xml version="1.0" encoding="utf-8"?>'."n".'<!DOCTYPE TEI [
<!ENTITY header SYSTEM "'.$fn.'.teih">
<!ENTITY text SYSTEM "'.$fn.'.txml">
]>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:gnc="http://iness.uib.no/ns/1.0">
&header;
&text;
</TEI>';
close (OUT);
$output_filename=~s/.xml/.txml/gi;
open (OUT, ">:encoding(utf-8)",$output_filename) || die "konnte die ausgabedatei "$output_filename" nicht oeffnen: $!n";
print OUT $output;
close OUT;
}
# Entry point: walk the input tree and convert every .txt file found.
#
# For each start folder, get_files() collects its sub-directories into
# @folders (the start folder itself is used when none are found). For each of
# those folders, $folder_out is derived by replacing "0_Eingabe" with
# "1_Ausgabe", and every *.txt file in the folder is passed to
# go_go_gadget().
#
# NOTE(review): $dev is set to 1 on every iteration, so $folder_out is always
# cut back to end at "1_Ausgabe" and no per-file sub-folder is used.
# NOTE(review): the unconditional "next;" after the file loop means the
# page-error report code below it is never reached.
# NOTE(review): the path strings and patterns in this copy appear to have
# lost their backslash escapes ("D:\bla", "\") - verify against the original
# script.
sub main
{
print "nBeginne...n";
my $root="D:\bla";
my @startfolders=($root."\");
$folder_out="D:\bla";
foreach my $startfolder(@startfolders)
{
@folders=();
get_files($startfolder);
if (scalar(@folders)<1) { push(@folders,$startfolder); }
$root=~s/0_Eingabe/1_Ausgabe\1/;
foreach $folder_in(@folders)
{
$page_errors='';
$folder_out=$folder_in;
$folder_out=~s/0_Eingabe/1_Ausgabe/;
# creating subfolders too
#my $tmp=$folder_out;
#$tmp=~s/Q$rootE//;
#my @arr_tmp=split("\\",$tmp);
#$tmp="";
#foreach my $dings (@arr_tmp)
#{
# next if ($dings eq '');
# $tmp.="\".$dings;
# mkdir($root.$tmp,0777) if (!(-d $root.$tmp));
#}#
# -----
$folder_out=~s/\+/\/g;
$dev = 1; # developer mode is on when 1
$folder_out=~s/1_Ausgabe.*/1_Ausgabe/ if ($dev eq 1);
print "Ordner ".$folder_in."n";
foreach my $file_xml(<${folder_in}/*.txt>)
{
go_go_gadget($file_xml);
}
next;
if ($page_errors ne '')
{
$folder_in=~m/0_Eingabe\(.+)/;
my $tmp=$1;
$tmp=~s/\+/__/g;
open (OUT, ">:encoding(utf-8)", $root."\".$tmp.".txt") || die "ntPage errors to file ".$tmp.": ".$!."n";
print OUT $page_errors;
close OUT;
}
}
}
print "Fertig!nn";
}
main();
然而,有些文件的处理时间太长。我希望任何一个文件的处理只要超过 6 秒就被跳过,程序直接转到下一个文件。请问如何用超时机制来实现这一点?
我没有仔细阅读您的代码,所以无法准确指出超时代码应该放在哪里,但您应该可以用 Time::Out 模块轻松实现您想要的功能。只需使用:
use Time::Out 'timeout';

# Run the block for at most 6 seconds. When the time is up the block is
# aborted and $@ is set; execution then continues after the call.
timeout 6 => sub {
    # code that you want to time out after 6 seconds goes here
};
if ($@) {
    # the block was aborted because it ran longer than 6 seconds
}