--- trunk/swish/html2xml.pl 2003/09/03 15:34:58 87
+++ trunk/swish/html2xml.pl 2003/09/28 02:19:59 91
@@ -15,7 +15,6 @@
use strict;
use Getopt::Std;
-use Lingua::Spelling::Alternative;
require Unicode::Map8;
use GDBM_File;
@@ -28,9 +27,19 @@
my $aname; ## ancor name na originalnim stranicama
my $nn_dir="../"; # dir u kojem su wget-ani fileovi
-my $url="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d";
+#my $path_fmt="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d";
-my $gdbm_file="./brzakona.gdbm";
+# configure gdbm files here
+my $gdbm_brzakona="$nn_dir/swish/brzakona.gdbm";
+my $gdbm_file2title="$nn_dir/swish/file2title.gdbm";
+
+# where to drop full text URLs
+my $full_url_list="$nn_dir/sluzbeno/in.url";
+
+# URL to original site
+my $full_url="http://www.nn.hr/clanci/sluzbeno/";
+my $full_filename_fmt="%04d/%04s.htm";
+my $path_fmt = $full_filename_fmt;
my %opts;
getopts("vqdl:", \%opts);
@@ -39,14 +48,13 @@
my $zakona=0;
my $zak_u_broju;
-my $hr = new Lingua::Spelling::Alternative( DEBUG => $opts{d} );
-#$hr->load_affix("$nn_dir/search/croatian.aff");
-$hr->load_findaffix("$nn_dir/prvih_50.txt");
my $l2_map = Unicode::Map8->new("ISO-8859-2") || die;
my %br_zakona;
-tie %br_zakona, 'GDBM_File', $gdbm_file, &GDBM_NEWDB, 0640;
+tie %br_zakona, 'GDBM_File', $gdbm_brzakona.".temp", &GDBM_NEWDB, 0644;
+my %file2title;
+tie %file2title, 'GDBM_File', $gdbm_file2title.".temp", &GDBM_NEWDB, 0644;
#--------------------------------------------------------------------
@@ -55,7 +63,11 @@
my $br = shift || return;
my $zak_u_broju = shift || return;
print STDERR "[$god/$br: $zak_u_broju zakona]\n" if (! $opts{q});
- $br_zakona{sprintf("%04d/%02d",$god,$br)} = $zak_u_broju;
+ if (! $br_zakona{sprintf("%04d",$god)}) {
+ $br_zakona{sprintf("%04d",$god)} = $zak_u_broju;
+ } else {
+ $br_zakona{sprintf("%04d",$god)} += $zak_u_broju;
+ }
}
#--------------------------------------------------------------------
@@ -66,7 +78,8 @@
use utf8;
- print "Path-Name: ".sprintf($url,$god,$br,$nr,$aname)."\n".
+# print "Path-Name: ".sprintf($path_fmt,$god,$br,$nr,$aname)."\n".
+ print "Path-Name: ".sprintf($path_fmt,$god,$nr)."\n".
"Content-Length: ".length($xml)."\n".
"Document-Type: XML\n".
"\n$xml";
@@ -74,6 +87,7 @@
#--------------------------------------------------------------------
+open(URL, '>', $full_url_list) || warn "can't open URL list file '$full_url_list': $!"; # 3-arg open: path is never parsed for a mode
opendir(DIR,$nn_dir) || warn "opendir: $!";
my @files;
@@ -117,7 +131,6 @@
my $naslov_czs = lc($naslov);
$naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/;
$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space
- $naslov_czs = join(" ",$hr->alternatives(split(/ /,$naslov_czs)));
# $naslov_czs = $hr->minimal(split(/ /,$naslov_czs));
my $xml="\n
$br\n$god\n$nr\n$aname\n";
my $naslov_utf=$l2_map->tou($naslov)->utf8;
@@ -130,7 +143,11 @@
$xml.="$naslov_utf\n";
$xml.="$naslov_czs\n\n\n";
dump_to_swish($xml,$god,$br,$nr,$aname);
-
+
+ my $file = sprintf($full_filename_fmt,$god,$nr);
+ print URL $full_url.$file."\n";
+ $file2title{$file}="$god $br $nr $aname $naslov";
+
$naslov="";
$nr=0;
$zakona++;
@@ -160,3 +177,7 @@
print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});
untie %br_zakona;
+
+# rename temp gdbm files into place; 'or' (not '||') so die tests rename's result, not the 2nd argument
+rename($gdbm_brzakona.".temp", $gdbm_brzakona) or die "can't rename $gdbm_brzakona: $!";
+rename($gdbm_file2title.".temp", $gdbm_file2title) or die "can't rename $gdbm_file2title: $!";