1 |
#!/usr/bin/perl -w |
#!/usr/bin/perl -w |
2 |
|
|
3 |
|
# indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2001-01-28 |
4 |
|
# options: -q quiet |
5 |
|
# -d debug |
6 |
|
# -v verbose |
7 |
|
|
8 |
use strict; |
use strict; |
9 |
use DBI; |
use DBI; |
10 |
|
use Getopt::Std; |
11 |
|
|
12 |
my $sadrzaj=0; |
my $sadrzaj=0; |
13 |
my $nr=0; |
my $nr=0; |
19 |
|
|
20 |
my $nn_dir="."; # dir u kojem su wget-ani fileovi |
my $nn_dir="."; # dir u kojem su wget-ani fileovi |
21 |
|
|
22 |
|
my %opts; |
23 |
|
# Parse the command-line switches into %opts.
# -v (verbose), -q (quiet) and -d (debug) are plain on/off flags, so they
# must be parsed with getopts(): getopt() treats every listed letter as a
# switch that takes a value, which swallows the following argument and
# leaves a lone flag undefined (i.e. false).
getopts("vqd", \%opts);
24 |
|
|
25 |
#-------------------------------------------------------------------- |
#-------------------------------------------------------------------- |
26 |
|
|
100 |
$tmp_word = $word.$add; |
$tmp_word = $word.$add; |
101 |
} |
} |
102 |
if ($tmp_word =~ m/$regexp/ix) { |
if ($tmp_word =~ m/$regexp/ix) { |
103 |
# print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n"; |
print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n" if ($opts{d}); |
104 |
push @out,lc($tmp_word); |
push @out,lc($tmp_word); |
105 |
} |
} |
106 |
} |
} |
122 |
my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR); |
my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR); |
123 |
closedir(DIR); |
closedir(DIR); |
124 |
|
|
125 |
|
my $sth=$dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") || die $dbh->errstr(); |
126 |
|
|
127 |
|
|
128 |
foreach my $file (@files) { |
foreach my $file (@files) { |
129 |
open(IN,$file) || die "can't open $file: $!"; |
open(IN,$file) || die "can't open $file: $!"; |
130 |
|
|
131 |
if ($file=~m/god=(\d+)\&br=(\d+)/) { |
if ($file=~m/god=(\d+)\&br=(\d+)/) { |
132 |
($br,$god) = ($2,$1); |
($br,$god) = ($2,$1); |
133 |
print "$file -- $2 -- $1\n"; |
print "$file -- $2 -- $1\n" if (! $opts{q}); |
134 |
} |
} |
135 |
|
|
136 |
while(<IN>) { |
while(<IN>) { |
149 |
$naslov=~s/<[^>]+>//g; |
$naslov=~s/<[^>]+>//g; |
150 |
$naslov=~s/^\s+//g; |
$naslov=~s/^\s+//g; |
151 |
$naslov=~s/\s+$//g; |
$naslov=~s/\s+$//g; |
152 |
print "$god $br $nr: $naslov\n"; |
print "$god $br $nr: $naslov\n" if ($opts{v}); |
153 |
my $naslov_czs = lc($naslov); |
my $naslov_czs = lc($naslov); |
154 |
$naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/; |
$naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/; |
155 |
$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space |
$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space |
156 |
$naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs))); |
$naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs))); |
157 |
$dbh->do("insert into nn (br,god,nr,aname,title,title_czs) values ($br,$god,$nr,'$aname','$naslov','$naslov_czs')") || die $dbh->errstr(); |
$sth->execute($br,$god,$nr,$aname,$naslov,$naslov_czs) || die $dbh->errstr(); |
158 |
$naslov=""; |
$naslov=""; |
159 |
$nr=0; |
$nr=0; |
160 |
} |
} |
162 |
if ($sadrzaj) { |
if ($sadrzaj) { |
163 |
if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) { |
if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) { |
164 |
($aname,$nr) = ($1,$2); |
($aname,$nr) = ($1,$2); |
165 |
} elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),(\d+),(\d+)\)[^>]*>//i) { |
} elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),(\w+),(\d+)\)[^>]*>//i) { |
166 |
($nr,$aname) = ($3,$4); |
($nr,$aname) = ($3,$4); |
167 |
die "conflict in godina: $1 != $god" if ($god != $1); |
die "conflict in godina: $1 != $god" if ($god != $1); |
168 |
die "conflict in broj: $2 != $br" if ($br != $2); |
die "conflict in broj: $2 != $br" if ($br != $2); |