41 |
my $full_filename_fmt="%04d/%04s.htm"; |
my $full_filename_fmt="%04d/%04s.htm"; |
42 |
my $path_fmt = $full_filename_fmt; |
my $path_fmt = $full_filename_fmt; |
43 |
|
|
44 |
|
# regex for filenames |
45 |
|
my $broj_html_re = qr/^(CijeliBrojS|pregled.asp)/; |
46 |
|
|
47 |
my %opts; |
my %opts; |
48 |
getopts("vqdl:", \%opts); |
getopts("vqdl:", \%opts); |
49 |
|
|
96 |
my @files; |
my @files; |
97 |
if ($opts{l}) { |
if ($opts{l}) { |
98 |
# add limit regex |
# add limit regex |
99 |
@files = grep { /^CijeliBrojS/ && /$opts{l}/ && -f "$nn_dir/$_" } readdir(DIR); |
@files = grep { $_ =~ $broj_html_re && /$opts{l}/ && -f "$nn_dir/$_" } readdir(DIR); |
100 |
print STDERR "Using limit regex which is '$opts{l}'\n"; |
print STDERR "Using limit regex which is '$opts{l}'\n"; |
101 |
} else { |
} else { |
102 |
@files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR); |
@files = grep { $_ =~ $broj_html_re && -f "$nn_dir/$_" } readdir(DIR); |
103 |
} |
} |
104 |
closedir(DIR); |
closedir(DIR); |
105 |
|
|
106 |
foreach my $file (sort @files) { |
foreach my $file (sort @files) { |
107 |
open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!"; |
open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!"; |
108 |
|
|
109 |
if ($file=~m/god=(\d+)\&br=(\d+)/) { |
if ($file=~m/(?:god|godina)=(\d+)\&(?:br|broj)=(\d+)/) { |
110 |
save_br_zak($god,$br,$zak_u_broju); |
if ($god && $br && $zak_u_broju) { |
111 |
print STDERR "$file " if (! $opts{q}); |
save_br_zak($god,$br,$zak_u_broju); |
112 |
|
} |
113 |
($br,$god) = ($2,$1); |
($br,$god) = ($2,$1); |
114 |
$brojeva++; |
$brojeva++; |
115 |
$zak_u_broju = 0; |
$zak_u_broju = 0; |
116 |
|
print STDERR "$file $god $br\n" if (! $opts{q}); |
117 |
} |
} |
118 |
|
|
119 |
|
my $insert_in_swish = 0; |
120 |
|
|
121 |
while(<IN>) { |
while(<IN>) { |
122 |
chomp; |
chomp; |
123 |
s/\015//g; # kill cr |
s/\015//g; # kill cr |
124 |
tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2 |
tr/šðžèæŠÐŽÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2 |
125 |
|
|
126 |
|
# |
127 |
|
# parse old pages (CijeliBrojS.asp) with <div class=sadrzaj> |
128 |
|
# |
129 |
|
|
130 |
if (m,<div class=sadrzaj>,) { |
if (m,<div class=sadrzaj>,) { |
131 |
$sadrzaj++; |
$sadrzaj++; |
132 |
next; |
next; |
133 |
} |
} |
134 |
|
|
135 |
if ($sadrzaj && m,</div>,) { |
if ($sadrzaj) { |
136 |
$sadrzaj--; |
if (s/<a href="#([^"]+)">\s*(\S+)\.\s*<[^>]+>//i) { |
137 |
|
($aname,$nr) = ($1,$2); |
138 |
|
} elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),'*(\w+)'*,(\d+)\)[^>]*>//i) { |
139 |
|
($nr,$aname) = ($3,$4); |
140 |
|
die "conflict in godina: $1 != $god" if ($god != $1); |
141 |
|
die "conflict in broj: $2 != $br" if ($br != $2); |
142 |
|
} else { |
143 |
|
die "can't find nr in line: $_ [$file]"; |
144 |
|
} |
145 |
|
$naslov.=$_; |
146 |
|
$naslov=~s/^\s*$nr\.*\s*//g; |
147 |
|
$sadrzaj = 0; |
148 |
|
$insert_in_swish = 1; |
149 |
|
} |
150 |
|
|
151 |
|
# |
152 |
|
# new pregled.asp format |
153 |
|
# |
154 |
|
|
155 |
|
if (m#<A TARGET="ispis" HREF="/clanci/sluzbeno/(\d+)/(\d+).htm">\s*(\d+)\.*\s+([^<]+)</A>#) { |
156 |
|
($god, $nr, $aname, $naslov) = ($1,$2,$3,$4); |
157 |
|
$naslov=~s/^\s*$nr\.*\s*//g; |
158 |
|
$insert_in_swish = 1; |
159 |
|
} |
160 |
|
|
161 |
|
if ($insert_in_swish) { |
162 |
|
$insert_in_swish = 0; |
163 |
$naslov=~s/\s+/ /g; |
$naslov=~s/\s+/ /g; |
164 |
$naslov=~s/<[^>]+>//g; |
$naslov=~s/<[^>]+>//g; |
165 |
$naslov=~s/^\s+//g; |
$naslov=~s/^\s+//g; |
167 |
print STDERR "$god $br $nr: $naslov\n" if ($opts{v}); |
print STDERR "$god $br $nr: $naslov\n" if ($opts{v}); |
168 |
my $naslov_czs = lc($naslov); |
my $naslov_czs = lc($naslov); |
169 |
$naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/; |
$naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/; |
170 |
$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space |
$naslov_czs =~ tr/a-zA-Z0-9/ /cs; # non a-z -> space |
171 |
# $naslov_czs = $hr->minimal(split(/ /,$naslov_czs)); |
# $naslov_czs = $hr->minimal(split(/ /,$naslov_czs)); |
172 |
my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n"; |
my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n"; |
173 |
my $naslov_utf=$l2_map->tou($naslov)->utf8; |
my $naslov_utf=$l2_map->tou($naslov)->utf8; |
193 |
$zak_u_broju++; |
$zak_u_broju++; |
194 |
} |
} |
195 |
|
|
|
if ($sadrzaj) { |
|
|
if (s/<a href="#([^"]+)">\s*(\S+)\.\s*<[^>]+>//i) { |
|
|
($aname,$nr) = ($1,$2); |
|
|
} elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),'*(\w+)'*,(\d+)\)[^>]*>//i) { |
|
|
($nr,$aname) = ($3,$4); |
|
|
die "conflict in godina: $1 != $god" if ($god != $1); |
|
|
die "conflict in broj: $2 != $br" if ($br != $2); |
|
|
} else { |
|
|
die "can't find nr in line: $_"; |
|
|
} |
|
|
$naslov.=$_; |
|
|
$naslov=~s/^\s*$nr\.*\s*//g; |
|
|
} |
|
196 |
|
|
197 |
} |
} |
198 |
|
|