#!/usr/bin/perl -w

# Import table-of-contents entries from downloaded "CijeliBrojS*" HTML pages
# into the PostgreSQL table "nn" (database "nn").
#
# For every matching file in $nn_dir the year ("god") and issue number ("br")
# are taken from the file name.  Each <div class=sadrzaj> ... </div> block in
# the page yields one row: the anchor name and entry number come from the
# leading <a href="#..."> link, the title is the remaining text with markup
# stripped, and title_czs is an ASCII-only variant of the title.
#
# The table is emptied before the import and vacuumed afterwards.

use strict;
use DBI;

my $nn_dir = ".";    # directory holding the wget-fetched files

# Read the directory listing first, so that an unreadable directory aborts
# the run before the existing rows are deleted.
opendir(my $dir, $nn_dir) || die "opendir $nn_dir: $!";
my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir($dir);
closedir($dir);

my $dbh = DBI->connect("DBI:Pg:dbname=nn", "", "") || die $DBI::errstr;

$dbh->do("delete from nn") || die $dbh->errstr();

# Placeholders keep titles containing quotes from breaking (or injecting
# into) the statement.
my $sth = $dbh->prepare(
    "insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)"
) || die $dbh->errstr();

foreach my $file (@files) {

    # Year and issue number are encoded in the file name; without them the
    # rows could not be attributed, so such files are skipped.
    unless ($file =~ m/god=(\d+)\&br=(\d+)/) {
        warn "skipping $file: no god=/br= in file name\n";
        next;
    }
    my ($god, $br) = ($1, $2);    # NN year, NN issue number
    print "$file -- $br -- $god\n";

    my $path = "$nn_dir/$file";
    open(my $in, '<', $path) || die "can't open $path: $!";

    # Parser state, reset for every file.
    my $sadrzaj = 0;     # non-zero while inside a <div class=sadrzaj> block
    my $nr      = 0;     # entry number taken from the anchor text
    my $aname   = "";    # anchor name on the original pages
    my $naslov  = "";    # raw title text collected for the current entry

    while (<$in>) {
        chomp;
        s/\015//g;    # kill CR

        # CP1250 -> ISO-8859-2.  Only four letters (s/z with caron, both
        # cases) sit at different code points; written as hex escapes so the
        # mapping does not depend on the encoding of this source file.
        tr/\x9A\x9E\x8A\x8E/\xB9\xBE\xA9\xAE/;

        $sadrzaj++ if (m,<div class=sadrzaj>,);

        if ($sadrzaj) {
            # Strip the leading '<a href="#NAME"> NR. <tag>' and remember
            # the full anchor name and the entry number.
            if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
                ($aname, $nr) = ($1, $2);
            }
            # A line break counts as whitespace; keep words on adjacent
            # lines from running together.
            $naslov .= "$_ ";
        }

        if ($sadrzaj && m,</div>,) {
            $sadrzaj--;
            $naslov =~ s/\s+/ /g;
            $naslov =~ s/<[^>]+>//g;
            $naslov =~ s/^\s+//g;
            $naslov =~ s/\s+$//g;
            print "$god $br $nr: $naslov\n";

            # ASCII-only copy of the title: ISO-8859-2 accented letters are
            # replaced by their base letters, everything that is not a
            # letter collapses into single spaces.
            # NOTE(review): lc() runs before the mapping, so upper-case
            # accented letters end up as upper-case ASCII -- kept as in the
            # original; confirm whether title_czs should be all lower-case.
            my $naslov_czs = lc($naslov);
            $naslov_czs =~ tr/\xB9\xA9\xF0\xD0\xE8\xC8\xE6\xC6\xBE\xAE/sSdDcCcCzZ/;
            $naslov_czs =~ tr/a-zA-Z/ /cs;    # non a-z -> space

            $sth->execute($br, $god, $nr, $aname, $naslov, $naslov_czs)
                || die $sth->errstr();

            # Start the next entry from a clean state.
            $naslov = "";
            $nr     = 0;
            $aname  = "";
        }
    }

    close($in);
}

$dbh->do("vacuum") || die $dbh->errstr();
$dbh->disconnect();