1 |
#!/usr/bin/perl -w |
2 |
|
3 |
# indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2002-06-19 |
4 |
# options: -q quiet |
5 |
# -d debug |
6 |
# -v verbose |
7 |
# -l limit regex |
8 |
|
9 |
# This indexer output xml data which is used to index content with |
10 |
# swish-e 2.2, http://www.swish-e.org/ |
11 |
# |
12 |
# xml is output is on STDOUT and informational oputput (for humas) is |
13 |
# on STDERR |
14 |
# |
15 |
|
16 |
use strict; |
17 |
use Getopt::Std; |
18 |
require Unicode::Map8; |
19 |
use GDBM_File; |
20 |
|
21 |
my $sadrzaj=0; |
22 |
my $nr=0; |
23 |
my $naslov=""; |
24 |
|
25 |
my $br; ## broj NN |
26 |
my $god; ## godina NN |
27 |
my $aname; ## ancor name na originalnim stranicama |
28 |
|
29 |
my $nn_dir="../"; # dir u kojem su wget-ani fileovi |
30 |
#my $path_fmt="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d"; |
31 |
|
32 |
# configure gdbm files here |
33 |
my $gdbm_brzakona="$nn_dir/swish/brzakona.gdbm"; |
34 |
my $gdbm_file2title="$nn_dir/swish/file2title.gdbm"; |
35 |
|
36 |
# where to drop full text URLs |
37 |
my $full_url_list="$nn_dir/sluzbeno/in.url"; |
38 |
|
39 |
# URL to original site |
40 |
my $full_url="http://www.nn.hr/clanci/sluzbeno/"; |
41 |
my $full_filename_fmt="%04d/%04s.htm"; |
42 |
my $path_fmt = $full_filename_fmt; |
43 |
|
44 |
# regex for filenames |
45 |
my $broj_html_re = qr/^(CijeliBrojS|pregled.asp)/; |
46 |
|
47 |
my %opts; |
48 |
getopts("vqdl:", \%opts); |
49 |
|
50 |
my $brojeva=0; |
51 |
my $zakona=0; |
52 |
my $zak_u_broju; |
53 |
|
54 |
|
55 |
my $l2_map = Unicode::Map8->new("ISO-8859-2") || die; |
56 |
|
57 |
my %br_zakona; |
58 |
tie %br_zakona, 'GDBM_File', $gdbm_brzakona.".temp", &GDBM_NEWDB, 0644; |
59 |
my %file2title; |
60 |
tie %file2title, 'GDBM_File', $gdbm_file2title.".temp", &GDBM_NEWDB, 0644; |
61 |
|
62 |
#-------------------------------------------------------------------- |
63 |
|
64 |
sub save_br_zak { |
65 |
my $god = shift || return; |
66 |
my $br = shift || return; |
67 |
my $zak_u_broju = shift || return; |
68 |
print STDERR "[$god/$br: $zak_u_broju zakona]\n" if (! $opts{q}); |
69 |
if (! $br_zakona{sprintf("%04d",$god)}) { |
70 |
$br_zakona{sprintf("%04d",$god)} = $zak_u_broju; |
71 |
} else { |
72 |
$br_zakona{sprintf("%04d",$god)} += $zak_u_broju; |
73 |
} |
74 |
} |
75 |
|
76 |
#-------------------------------------------------------------------- |
77 |
|
78 |
sub dump_to_swish { |
79 |
my $xml = shift @_; |
80 |
my ($god,$br,$nr,$aname) = @_; |
81 |
|
82 |
use utf8; |
83 |
|
84 |
# print "Path-Name: ".sprintf($path_fmt,$god,$br,$nr,$aname)."\n". |
85 |
print "Path-Name: ".sprintf($path_fmt,$god,$nr)."\n". |
86 |
"Content-Length: ".length($xml)."\n". |
87 |
"Document-Type: XML\n". |
88 |
"\n$xml"; |
89 |
} |
90 |
|
91 |
#-------------------------------------------------------------------- |
92 |
|
93 |
open(URL,"> $full_url_list") || warn "can't open URL list file '$full_url_list': $!"; |
94 |
|
95 |
opendir(DIR,$nn_dir) || warn "opendir: $!"; |
96 |
my @files; |
97 |
if ($opts{l}) { |
98 |
# add limit regex |
99 |
@files = grep { $_ =~ $broj_html_re && /$opts{l}/ && -f "$nn_dir/$_" } readdir(DIR); |
100 |
print STDERR "Using limit regex which is '$opts{l}'\n"; |
101 |
} else { |
102 |
@files = grep { $_ =~ $broj_html_re && -f "$nn_dir/$_" } readdir(DIR); |
103 |
} |
104 |
closedir(DIR); |
105 |
|
106 |
foreach my $file (sort @files) { |
107 |
open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!"; |
108 |
|
109 |
if ($file=~m/(?:god|godina)=(\d+)\&(?:br|broj)=(\d+)/) { |
110 |
if ($god && $br && $zak_u_broju) { |
111 |
save_br_zak($god,$br,$zak_u_broju); |
112 |
} |
113 |
($br,$god) = ($2,$1); |
114 |
$brojeva++; |
115 |
$zak_u_broju = 0; |
116 |
print STDERR "$file $god $br\n" if (! $opts{q}); |
117 |
} |
118 |
|
119 |
my $insert_in_swish = 0; |
120 |
|
121 |
while(<IN>) { |
122 |
chomp; |
123 |
s/\015//g; # kill cr |
124 |
tr/ðèæÐÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2 |
125 |
|
126 |
# |
127 |
# parse old pages (CijeliBrojS.asp) with <div class=sadrzaj> |
128 |
# |
129 |
|
130 |
if (m,<div class=sadrzaj>,) { |
131 |
$sadrzaj++; |
132 |
next; |
133 |
} |
134 |
|
135 |
if ($sadrzaj) { |
136 |
if (s/<a href="#([^"]+)">\s*(\S+)\.\s*<[^>]+>//i) { |
137 |
($aname,$nr) = ($1,$2); |
138 |
} elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),'*(\w+)'*,(\d+)\)[^>]*>//i) { |
139 |
($nr,$aname) = ($3,$4); |
140 |
die "conflict in godina: $1 != $god" if ($god != $1); |
141 |
die "conflict in broj: $2 != $br" if ($br != $2); |
142 |
} else { |
143 |
die "can't find nr in line: $_ [$file]"; |
144 |
} |
145 |
$naslov.=$_; |
146 |
$naslov=~s/^\s*$nr\.*\s*//g; |
147 |
$sadrzaj = 0; |
148 |
$insert_in_swish = 1; |
149 |
} |
150 |
|
151 |
# |
152 |
# new pregled.asp format |
153 |
# |
154 |
|
155 |
if (m#<A TARGET="ispis" HREF="/clanci/sluzbeno/(\d+)/(\d+).htm">\s*(\d+)\.*\s+([^<]+)</A>#) { |
156 |
($god, $nr, $aname, $naslov) = ($1,$2,$3,$4); |
157 |
$naslov=~s/^\s*$nr\.*\s*//g; |
158 |
$insert_in_swish = 1; |
159 |
} |
160 |
|
161 |
if ($insert_in_swish) { |
162 |
$insert_in_swish = 0; |
163 |
$naslov=~s/\s+/ /g; |
164 |
$naslov=~s/<[^>]+>//g; |
165 |
$naslov=~s/^\s+//g; |
166 |
$naslov=~s/\s+$//g; |
167 |
print STDERR "$god $br $nr: $naslov\n" if ($opts{v}); |
168 |
my $naslov_czs = lc($naslov); |
169 |
$naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/; |
170 |
$naslov_czs =~ tr/a-zA-Z0-9/ /cs; # non a-z -> space |
171 |
# $naslov_czs = $hr->minimal(split(/ /,$naslov_czs)); |
172 |
my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n"; |
173 |
my $naslov_utf=$l2_map->tou($naslov)->utf8; |
174 |
|
175 |
# Escape <, >, & and ", and to produce valid XML |
176 |
my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"'); |
177 |
my $escape_re = join '|' => keys %escape; |
178 |
$naslov_utf =~ s/($escape_re)/$escape{$1}/g; |
179 |
|
180 |
$xml.="<naslov>$naslov_utf</naslov>\n"; |
181 |
$xml.="<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n"; |
182 |
dump_to_swish($xml,$god,$br,$nr,$aname); |
183 |
|
184 |
my $file = sprintf($full_filename_fmt,$god,$nr); |
185 |
if (! -e "$nn_dir/sluzbeno/$file") { |
186 |
print URL $full_url.$file."\n"; |
187 |
} |
188 |
$file2title{$file}="$god $br $nr $aname $naslov"; |
189 |
|
190 |
$naslov=""; |
191 |
$nr=0; |
192 |
$zakona++; |
193 |
$zak_u_broju++; |
194 |
} |
195 |
|
196 |
|
197 |
} |
198 |
|
199 |
close(IN); |
200 |
} |
201 |
|
202 |
save_br_zak($god,$br,$zak_u_broju); |
203 |
print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q}); |
204 |
|
205 |
untie %br_zakona; |
206 |
|
207 |
# rename temp gdbm files |
208 |
rename $gdbm_brzakona.".temp",$gdbm_brzakona || die "can't rename $gdbm_brzakona: $!"; |
209 |
rename $gdbm_file2title.".temp",$gdbm_file2title || die "can't rename $gdbm_file2title: $!"; |