1 |
#!/usr/bin/perl -w |
2 |
|
3 |
# indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2002-06-19 |
4 |
# options: -q quiet |
5 |
# -d debug |
6 |
# -v verbose |
7 |
# -l limit regex |
8 |
|
9 |
# This indexer outputs XML data which is used to index content with
# swish-e 2.2, http://www.swish-e.org/
#
# XML is written to STDOUT and informational output (for humans) goes
# to STDERR
14 |
# |
15 |
|
16 |
use strict; |
17 |
use Getopt::Std; |
18 |
use Lingua::Spelling::Alternative; |
19 |
require Unicode::Map8; |
20 |
|
21 |
# Parser state shared with the main loop over the input files.
my $sadrzaj = 0;    # > 0 while inside a <div class=sadrzaj> block
my $nr      = 0;    # number of the entry currently being collected
my $naslov  = "";   # title text collected for the current entry

my $br;             # NN issue number
my $god;            # NN year
my $aname;          # anchor name on the original pages

my $nn_dir = "../"; # directory holding the wget-fetched files
# sprintf template for a document URL: year, issue, entry number, anchor
my $url = "http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%d#%d";

# Command line switches: -v verbose, -q quiet, -d debug, -l <regex> limit
my %opts;
getopts("vqdl:", \%opts);

# Counters reported in the closing summary.
my $brojeva = 0;    # input files whose name carried a year and issue number
my $zakona  = 0;    # documents handed to dump_to_swish

# Direct method call instead of the indirect-object form "new Class (...)",
# which perl can mis-parse.
my $hr = Lingua::Spelling::Alternative->new( DEBUG => $opts{d} );
#$hr->load_affix("$nn_dir/search/croatian.aff");
$hr->load_findaffix("$nn_dir/prvih_50.txt");

# Character map used to convert ISO-8859-2 titles to Unicode.
my $l2_map = Unicode::Map8->new("ISO-8859-2")
    || die "can't load Unicode::Map8 mapping for ISO-8859-2";
43 |
|
44 |
#-------------------------------------------------------------------- |
45 |
# Print one document on STDOUT, preceded by the header block
# (Path-Name, Content-Length, Document-Type) and an empty line.
#
#   $xml                    - document body
#   $god, $br, $nr, $aname  - values substituted, in this order, into the
#                             file-level $url sprintf template to build
#                             the Path-Name header
#
# Content-Length has to describe the body in bytes, so the body is forced
# into UTF-8 octets before it is measured and printed.
sub dump_to_swish {
    my $xml = shift @_;
    my ($god,$br,$nr,$aname) = @_;

    # If $xml arrived as a character string (UTF-8 flag on), length() would
    # count characters and print could emit a different number of bytes.
    # Encoding it first makes both agree; a plain byte string is left alone.
    utf8::encode($xml) if utf8::is_utf8($xml);

    print "Path-Name: ".sprintf($url,$god,$br,$nr,$aname)."\n".
        "Content-Length: ".length($xml)."\n".
        "Document-Type: XML\n".
        "\n$xml";
}
56 |
|
57 |
#-------------------------------------------------------------------- |
58 |
|
59 |
|
60 |
# Collect the plain files in $nn_dir whose names start with "CijeliBrojS",
# optionally narrowed by the -l regex.
my @files;
if (opendir(my $dir, $nn_dir)) {
    @files = grep {
        /^CijeliBrojS/
            && (!$opts{l} || /$opts{l}/)    # -l: user supplied limit regex
            && -f "$nn_dir/$_"
    } readdir($dir);
    closedir($dir);
} else {
    # Carry on with an empty list (the script still prints its summary),
    # but do not read from a handle that was never opened.
    warn "opendir '$nn_dir': $!";
}
if ($opts{l}) {
    print STDERR "Using limit regex which is '$opts{l}'\n";
}
70 |
|
71 |
# Walk every collected file, pick the entries out of each
# <div class=sadrzaj> ... </div> block and hand one XML document per
# entry to dump_to_swish.
foreach my $file (@files) {
    # Three-argument open with a lexical handle: the file names contain
    # characters such as '?', '&' and '=' and must never be interpreted as
    # an open mode or a pipe.
    open(my $in, '<', "$nn_dir/$file") || die "can't open '$nn_dir/$file': $!";

    # Year and issue number are encoded in the file name.
    if ($file =~ m/god=(\d+)\&br=(\d+)/) {
        ($god, $br) = ($1, $2);
        print STDERR "$file -- $br -- $god\n" if (! $opts{q});
        $brojeva++;
    }

    while (<$in>) {
        chomp;
        s/\015//g;      # kill cr

        # cp1250 -> iso8859-2.  Only s/z with caron (both cases) sit at
        # different code points in the two charsets; the other Croatian
        # letters (0xf0 0xe8 0xe6 0xd0 0xc8 0xc6) are identical and need
        # no mapping.  Hex escapes keep the table independent of the
        # encoding this source file happens to be saved in.
        tr/\x9a\x9e\x8a\x8e/\xb9\xbe\xa9\xae/;

        # Opening of a table-of-contents block.
        if (m,<div class=sadrzaj>,) {
            $sadrzaj++;
            next;
        }

        # End of a block: clean up the collected title and emit the entry.
        if ($sadrzaj && m,</div>,) {
            $sadrzaj--;
            $naslov =~ s/\s+/ /g;       # collapse whitespace runs
            $naslov =~ s/<[^>]+>//g;    # strip remaining tags
            $naslov =~ s/^\s+//g;
            $naslov =~ s/\s+$//g;
            print STDERR "$god $br $nr: $naslov\n" if ($opts{v});

            # ASCII-only variant of the title: the iso8859-2 letters
            # s/d/c/c/z with diacritics (lower and upper case) are folded
            # to their base letter, everything that is not a-z/A-Z becomes
            # a single space, and each word is expanded by the speller.
            # NOTE(review): lc() runs before the fold, so upper-case
            # accented letters come out as upper-case ASCII -- confirm
            # whether that is intended.
            my $naslov_czs = lc($naslov);
            $naslov_czs =~ tr/\xb9\xa9\xf0\xd0\xe8\xc8\xe6\xc6\xbe\xae/sSdDcCcCzZ/;
            $naslov_czs =~ tr/a-zA-Z/ /cs;      # non a-z -> space
            $naslov_czs = join(" ", $hr->alternatives(split(/ /, $naslov_czs)));
#           $naslov_czs = $hr->minimal(split(/ /,$naslov_czs));

            my $xml = "<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n";
            $xml .= "<naslov>". $l2_map->tou($naslov)->utf8 ."</naslov>\n";
            $xml .= "<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n";
            dump_to_swish($xml, $god, $br, $nr, $aname);

            # Reset per-entry state.
            $naslov = "";
            $nr = 0;
            $zakona++;
        }

        # Inside a block every line has to carry the entry link, in one of
        # two known forms; the link markup is removed and the rest of the
        # line is appended to the title.
        if ($sadrzaj) {
            if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
                ($aname, $nr) = ($1, $2);
            } elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),'*(\w+)'*,(\d+)\)[^>]*>//i) {
                ($nr, $aname) = ($3, $4);
                # The link repeats year and issue; they must agree with
                # the values taken from the file name.
                die "conflict in godina: $1 != $god" if ($god != $1);
                die "conflict in broj: $2 != $br" if ($br != $2);
            } else {
                die "can't find nr in line: $_";
            }
            $naslov .= $_;
            $naslov =~ s/^\s*$nr\.*\s*//g;      # drop the leading "<nr>."
        }

    }

    close($in);
}
130 |
|
131 |
# Closing summary on STDERR, suppressed by -q: the number of input files
# counted and the number of documents emitted.
unless ($opts{q}) {
    print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n";
}
132 |
|