16 |
use strict; |
use strict; |
17 |
use Getopt::Std; |
use Getopt::Std; |
18 |
use Lingua::Spelling::Alternative; |
use Lingua::Spelling::Alternative; |
19 |
|
require Unicode::Map8; |
20 |
|
|
21 |
# File-level state and configuration.
# NOTE(review): this excerpt looks like a side-by-side diff rendering
# (duplicated rows, bare line numbers, elided hunks); only what is
# visible here is documented.
# NOTE(review): the use of $sadrzaj is not visible in this excerpt.
my $sadrzaj=0; |
my $sadrzaj=0; |
22 |
# $nr is reset to 0 after each record is dumped in the main loop.
my $nr=0; |
my $nr=0; |
27 |
my $aname; ## anchor name on the original pages |
my $aname; ## anchor name on the original pages |
28 |
|
|
29 |
my $nn_dir="../"; # directory containing the wget-fetched files |
my $nn_dir="../"; # directory containing the wget-fetched files |
30 |
|
# sprintf template for the Path-Name header; dump_to_swish fills the four
# %d slots with ($god, $br, $nr, $aname), in that order.
my $url="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%d&mid=%d#%d"; |
31 |
|
|
32 |
my %opts; |
my %opts; |
33 |
# Command-line switches: -v, -q and -d are flags, -l takes an argument.
# -q suppresses the final summary on STDERR; -l is tested when the file
# list is built.
getopts("vqdl:", \%opts); |
getopts("vqdl:", \%opts); |
39 |
# NOTE(review): $hr is created in lines not shown here; presumably a
# Lingua::Spelling::Alternative object -- confirm.
#$hr->load_affix("$nn_dir/search/croatian.aff"); |
#$hr->load_affix("$nn_dir/search/croatian.aff"); |
40 |
$hr->load_findaffix("$nn_dir/prvih_50.txt"); |
$hr->load_findaffix("$nn_dir/prvih_50.txt"); |
41 |
|
|
42 |
|
# ISO-8859-2 character map; used in the main loop to convert the title
# ($naslov) to UTF-8. Dies (without a message) if the map cannot be created.
my $l2_map = Unicode::Map8->new("ISO-8859-2") || die; |
43 |
|
|
44 |
|
#-------------------------------------------------------------------- |
45 |
|
# Emit one document on STDOUT: three header lines, a blank line, then the
# XML body.
#
# Positional arguments: $xml, $god, $br, $nr, $aname
#   $xml                     - document body, printed verbatim after the headers
#   $god, $br, $nr, $aname   - substituted, in that order, into the file-level
#                              $url sprintf template to form the Path-Name
#
# Content-Length is length($xml). The header layout presumably targets the
# swish-e indexer, as the sub name suggests -- confirm against the consumer.
# Returns whatever print returns.
sub dump_to_swish {
    my ($xml, $god, $br, $nr, $aname) = @_;

    use utf8;

    # Assemble the pieces in output order and emit them as ONE string, so
    # that $, (the output field separator) can never end up between them.
    my @pieces = (
        "Path-Name: ", sprintf($url, $god, $br, $nr, $aname), "\n",
        "Content-Length: ", length($xml), "\n",
        "Document-Type: XML\n",
        "\n$xml",
    );

    print join('', @pieces);
}
56 |
|
|
57 |
#-------------------------------------------------------------------- |
#-------------------------------------------------------------------- |
58 |
|
|
59 |
|
|
60 |
# Open the directory holding the fetched files. A failure only warns, so
# execution continues even when the directory could not be opened.
opendir(DIR,$nn_dir) || warn "opendir: $!"; |
opendir(DIR,$nn_dir) || warn "opendir: $!"; |
61 |
# List of file names processed by the main loop below.
my @files; |
my @files; |
62 |
# NOTE(review): the body of this conditional (and any else branch) is not
# visible in this excerpt; presumably it fills @files, with -l selecting or
# limiting the files -- confirm.
if ($opts{l}) { |
if ($opts{l}) { |
68 |
} |
} |
69 |
closedir(DIR); |
closedir(DIR); |
70 |
|
|
|
# Opening tag of the XML wrapper printed around all records.
# NOTE(review): this row has no duplicate, unlike its neighbours, so it may
# belong to only one side of the diff -- confirm.
print "<xml>\n"; |
|
|
|
|
71 |
# Main loop: read every listed file from $nn_dir and emit one record per
# title found. NOTE(review): the parsing code between open() and the title
# normalisation is not visible in this excerpt.
foreach my $file (@files) { |
foreach my $file (@files) { |
72 |
# Abort the whole run if a listed file cannot be opened.
open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!"; |
open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!"; |
73 |
|
|
99 |
# tr with /c (complement) and /s (squeeze): every run of characters outside
# a-zA-Z collapses into a single space.
$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space |
$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space |
100 |
# Split on single spaces, pass the words to $hr->alternatives and re-join
# the returned list with spaces. NOTE(review): presumably this expands each
# word into spelling alternatives -- confirm against
# Lingua::Spelling::Alternative.
$naslov_czs = join(" ",$hr->alternatives(split(/ /,$naslov_czs))); |
$naslov_czs = join(" ",$hr->alternatives(split(/ /,$naslov_czs))); |
101 |
# $naslov_czs = $hr->minimal(split(/ /,$naslov_czs)); |
# $naslov_czs = $hr->minimal(split(/ /,$naslov_czs)); |
102 |
# NOTE(review): the next two statements look like the old and the new
# version of the same line in the diff: the first prints the record
# directly, the second starts building it in $xml for dump_to_swish.
print "<br>$br</br><god>$god</god><nr>$nr</nr><aname>$aname</aname>\n<naslov>$naslov</naslov>\n<naslov_czs>$naslov_czs</naslov_czs>\n"; |
my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n"; |
103 |
|
# Convert the title through the ISO-8859-2 map and append it as UTF-8.
$xml.="<naslov>". $l2_map->tou($naslov)->utf8 ."</naslov>\n"; |
104 |
|
# The normalised title was reduced to letters and spaces before the
# alternatives() call above, so it is appended without conversion.
$xml.="<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n"; |
105 |
|
# Emit the finished record with its Path-Name/Content-Length headers.
dump_to_swish($xml,$god,$br,$nr,$aname); |
106 |
|
|
107 |
# Reset per-record state and count the record for the final summary.
$naslov=""; |
$naslov=""; |
108 |
$nr=0; |
$nr=0; |
109 |
$zakona++; |
$zakona++; |
128 |
close(IN); |
close(IN); |
129 |
} |
} |
130 |
|
|
|
# Closing tag of the XML wrapper printed around all records.
print "</xml>\n"; |
|
|
|
|
131 |
# Final statistics on STDERR (Croatian: "Total of $brojeva NN issues, with
# $zakona laws..."); suppressed when -q is given.
print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q}); |
print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q}); |
132 |
|
|