--- trunk/swish/html2xml.pl 2003/09/03 15:34:58 87 +++ trunk/swish/html2xml.pl 2003/09/28 02:19:59 91 @@ -15,7 +15,6 @@ use strict; use Getopt::Std; -use Lingua::Spelling::Alternative; require Unicode::Map8; use GDBM_File; @@ -28,9 +27,19 @@ my $aname; ## ancor name na originalnim stranicama my $nn_dir="../"; # dir u kojem su wget-ani fileovi -my $url="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d"; +#my $path_fmt="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d"; -my $gdbm_file="./brzakona.gdbm"; +# configure gdbm files here +my $gdbm_brzakona="$nn_dir/swish/brzakona.gdbm"; +my $gdbm_file2title="$nn_dir/swish/file2title.gdbm"; + +# where to drop full text URLs +my $full_url_list="$nn_dir/sluzbeno/in.url"; + +# URL to original site +my $full_url="http://www.nn.hr/clanci/sluzbeno/"; +my $full_filename_fmt="%04d/%04s.htm"; +my $path_fmt = $full_filename_fmt; my %opts; getopts("vqdl:", \%opts); @@ -39,14 +48,13 @@ my $zakona=0; my $zak_u_broju; -my $hr = new Lingua::Spelling::Alternative( DEBUG => $opts{d} ); -#$hr->load_affix("$nn_dir/search/croatian.aff"); -$hr->load_findaffix("$nn_dir/prvih_50.txt"); my $l2_map = Unicode::Map8->new("ISO-8859-2") || die; my %br_zakona; -tie %br_zakona, 'GDBM_File', $gdbm_file, &GDBM_NEWDB, 0640; +tie %br_zakona, 'GDBM_File', $gdbm_brzakona.".temp", &GDBM_NEWDB, 0644; +my %file2title; +tie %file2title, 'GDBM_File', $gdbm_file2title.".temp", &GDBM_NEWDB, 0644; #-------------------------------------------------------------------- @@ -55,7 +63,11 @@ my $br = shift || return; my $zak_u_broju = shift || return; print STDERR "[$god/$br: $zak_u_broju zakona]\n" if (! $opts{q}); - $br_zakona{sprintf("%04d/%02d",$god,$br)} = $zak_u_broju; + if (! 
$br_zakona{sprintf("%04d",$god)}) { + $br_zakona{sprintf("%04d",$god)} = $zak_u_broju; + } else { + $br_zakona{sprintf("%04d",$god)} += $zak_u_broju; + } } #-------------------------------------------------------------------- @@ -66,7 +78,8 @@ use utf8; - print "Path-Name: ".sprintf($url,$god,$br,$nr,$aname)."\n". +# print "Path-Name: ".sprintf($path_fmt,$god,$br,$nr,$aname)."\n". + print "Path-Name: ".sprintf($path_fmt,$god,$nr)."\n". "Content-Length: ".length($xml)."\n". "Document-Type: XML\n". "\n$xml"; @@ -74,6 +87,7 @@ #-------------------------------------------------------------------- +open(URL,"> $full_url_list") || warn "can't open URL list file '$full_url_list': $!"; opendir(DIR,$nn_dir) || warn "opendir: $!"; my @files; @@ -117,7 +131,6 @@ my $naslov_czs = lc($naslov); $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/; $naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space - $naslov_czs = join(" ",$hr->alternatives(split(/ /,$naslov_czs))); # $naslov_czs = $hr->minimal(split(/ /,$naslov_czs)); my $xml="\n
$br
\n$god\n$nr\n$aname\n"; my $naslov_utf=$l2_map->tou($naslov)->utf8; @@ -130,7 +143,11 @@ $xml.="$naslov_utf\n"; $xml.="$naslov_czs\n
\n\n"; dump_to_swish($xml,$god,$br,$nr,$aname); - + + my $file = sprintf($full_filename_fmt,$god,$nr); + print URL $full_url.$file."\n"; + $file2title{$file}="$god $br $nr $aname $naslov"; + $naslov=""; $nr=0; $zakona++; @@ -160,3 +177,7 @@ print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q}); untie %br_zakona; + +# rename temp gdbm files into place (low-precedence 'or' so that die fires when rename fails) +rename $gdbm_brzakona.".temp",$gdbm_brzakona or die "can't rename $gdbm_brzakona: $!"; +rename $gdbm_file2title.".temp",$gdbm_file2title or die "can't rename $gdbm_file2title: $!";